From 9c3118a8252306accab2fdd97622b5a76d071029 Mon Sep 17 00:00:00 2001 From: "iap10@labyrinth.cl.cam.ac.uk" Date: Wed, 15 Dec 2004 23:09:11 +0000 Subject: [PATCH] bitkeeper revision 1.1159.1.483 (41c0c417XYObowWqbfqU0cdLx30C9w) Initial Intel VMX changes to support unmodified Linux guests on Intel's VT p latform. --- .rootkeys | 10 + docs/misc/VMX_changes.txt | 90 +++ xen/arch/x86/domain.c | 130 +++- xen/arch/x86/io_apic.c | 8 + xen/arch/x86/setup.c | 5 + xen/arch/x86/shadow.c | 172 ++++-- xen/arch/x86/time.c | 2 +- xen/arch/x86/vmx.c | 913 +++++++++++++++++++++++++++++ xen/arch/x86/vmx_io.c | 234 ++++++++ xen/arch/x86/vmx_vmcs.c | 503 ++++++++++++++++ xen/arch/x86/x86_32/entry.S | 98 ++++ xen/common/event_channel.c | 2 +- xen/common/kernel.c | 4 +- xen/common/softirq.c | 1 + xen/include/asm-x86/config.h | 1 + xen/include/asm-x86/cpufeature.h | 2 + xen/include/asm-x86/e820.h | 42 ++ xen/include/asm-x86/mm.h | 9 + xen/include/asm-x86/msr.h | 4 + xen/include/asm-x86/processor.h | 72 ++- xen/include/asm-x86/shadow.h | 217 ++++++- xen/include/asm-x86/vmx.h | 251 ++++++++ xen/include/asm-x86/vmx_cpu.h | 35 ++ xen/include/asm-x86/vmx_platform.h | 24 + xen/include/asm-x86/vmx_vmcs.h | 225 +++++++ xen/include/public/arch-x86_32.h | 1 + xen/include/public/io/ioreq.h | 59 ++ xen/include/xen/sched.h | 2 - xen/include/xen/types.h | 2 + 29 files changed, 3024 insertions(+), 94 deletions(-) create mode 100644 docs/misc/VMX_changes.txt create mode 100644 xen/arch/x86/vmx.c create mode 100644 xen/arch/x86/vmx_io.c create mode 100644 xen/arch/x86/vmx_vmcs.c create mode 100644 xen/include/asm-x86/e820.h create mode 100644 xen/include/asm-x86/vmx.h create mode 100644 xen/include/asm-x86/vmx_cpu.h create mode 100644 xen/include/asm-x86/vmx_platform.h create mode 100644 xen/include/asm-x86/vmx_vmcs.h create mode 100644 xen/include/public/io/ioreq.h diff --git a/.rootkeys b/.rootkeys index c15c15cda5..c6ce0aa40a 100644 --- a/.rootkeys +++ b/.rootkeys @@ -15,6 +15,7 @@ 
4187c1c7IWmBinGdI19kL4MuZ6RLbQ docs/check_pkgs 3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/figs/xenlogo.eps 418a3248xjIqmNKo0v_XQSfAvlBGFw docs/html.sty +41c0c4116itF389v0CEWcmzue6zJkA docs/misc/VMX_changes.txt 4022a73cgxX1ryj1HgS-IwwB6NUi2A docs/misc/XenDebugger-HOWTO 412f4bd9sm5mCQ8BkrgKcAKZGadq7Q docs/misc/blkif-drivers-explained.txt 40d6ccbfKKBq8jE0ula4eHEzBiQuDA docs/misc/xen_config.html @@ -698,6 +699,9 @@ 3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c 3ddb79bccYVzXZJyVaxuv5T42Z1Fsw xen/arch/x86/trampoline.S 3ddb79bcOftONV9h4QCxXOfiT0h91w xen/arch/x86/traps.c +41c0c411tD3C7TpfDMiFTf7BaNd_Dg xen/arch/x86/vmx.c +41c0c411ODt8uEmV-yUxpQLpqimE5Q xen/arch/x86/vmx_io.c +41c0c4128URE0dxcO15JME_MuKBPfg xen/arch/x86/vmx_vmcs.c 419cbedeQDg8IrO3izo3o5rQNlo0kQ xen/arch/x86/x86_32/asm-offsets.c 3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S @@ -808,6 +812,7 @@ 3ddb79c34BFiXjBJ_cCKB0aCsV1IDw xen/include/asm-x86/desc.h 40715b2dTokMLYGSuD58BnxOqyWVew xen/include/asm-x86/div64.h 3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-x86/domain_page.h +41c0c412Ufq5sAvri3dMHC1BXiO6Gw xen/include/asm-x86/e820.h 3ddb79c3NU8Zy40OTrq3D-i30Y3t4A xen/include/asm-x86/fixmap.h 3e2d29944GI24gf7vOP_7x8EyuqxeA xen/include/asm-x86/flushtlb.h 3ddb79c39o75zPP0T1aQQ4mNrCAN2w xen/include/asm-x86/hardirq.h @@ -837,6 +842,10 @@ 3ddb79c3ezddh34MdelJpa5tNR00Dw xen/include/asm-x86/system.h 3ddb79c4HugMq7IYGxcQKFBpKwKhzA xen/include/asm-x86/types.h 40cf1596saFaHD5DC5zvrSn7CDCWGQ xen/include/asm-x86/uaccess.h +41c0c412k6GHYF3cJtDdw37ee3TVaw xen/include/asm-x86/vmx.h +41c0c412hck3QX-6_MaXaISGkngQuA xen/include/asm-x86/vmx_cpu.h +41c0c41243jC1mcArZx_t3YkBL4lTA xen/include/asm-x86/vmx_platform.h +41c0c412lQ0NVVN9PsOSznQ-qhOiPA xen/include/asm-x86/vmx_vmcs.h 418fbcfe_WliJPToeVM-9VStvym-hw xen/include/asm-x86/x86_32/asm_defns.h 3ddb79c2ADvRmdexd9y3AYK9_NTx-Q xen/include/asm-x86/x86_32/current.h 3ddb79c3mbqEM7QQr3zVq7NiBNhouA 
xen/include/asm-x86/x86_32/regs.h @@ -857,6 +866,7 @@ 4121d149udGfSUGhn3k1ECz0bM31nQ xen/include/public/grant_table.h 40f5623bqoi4GEoBiiUc6TZk1HjsMg xen/include/public/io/blkif.h 40dc4076pVeE1kEEWzcUaNZin65kCA xen/include/public/io/domain_controller.h +41c0c412FLc0gunlJl91qMYscFtXVA xen/include/public/io/ioreq.h 40f5623cTZ80EwjWUBlh44A9F9i_Lg xen/include/public/io/netif.h 4051db79512nOCGweabrFWO2M2h5ng xen/include/public/physdev.h 40589968wmhPmV5-ENbBYmMjnedgKw xen/include/public/sched_ctl.h diff --git a/docs/misc/VMX_changes.txt b/docs/misc/VMX_changes.txt new file mode 100644 index 0000000000..739d315e79 --- /dev/null +++ b/docs/misc/VMX_changes.txt @@ -0,0 +1,90 @@ +Changes to Xen in support of Intel(R) Vanderpool Technology +------------------------------------------------------------- + +Our VT extensions to the Xen hypervisor provide full platform +virtualization, including CPU(s), memory, and I/O infrastructure. The +generic code in Xen handles and schedules those virtual machines as it +does for the existing para-virtualized domains. + +Full virtualization required by the OS guests requires full device +virtualization as well. The device models in BOCHS +(http://bochs.sourceforge.net/) were decoupled from the CPU +virtualization, and are used to virtualize the legacy devices (such as +keyboard, mouse, VGA, IDE) in the PC platform. At this point, the +device models run in user mode on domain 0, not in the Xen hypervisor. + +We would like to thank Ian Pratt and Keir Fraser for reviewing our +design and code intensively, and for providing numerous useful +suggestions to improve the architecture and code. + +We have a list of Intel team members who take credit for making this +release happen: Yunhong Jiang, Nitin Kamble, Chengyuan Li, Xin Li, +Xiaofeng Ling, Benjamin Liu, Asit Mallick, Jun Nakajima, Sunil Saxena, +Arun Sharma, Edwin Zhai, Jeff Zheng, and Louis Zhuang. We'll continue +to add more features to complete full virtualization in Xen using VT. 
+ +The notes document the changes to the Xen hypervisor in order to add +VT support. The changes to other areas, such as Control Panel will be +added as we deliver the code. + +Summary of changes for the first release +---------------------------------------- +December 15, 2004 + + * VT specific event handling and domain management were added. + + * Shadow mode was extended to support full 32-bit guests + + * Domain switching code was extended to support VT domain + + * I/O request handling was added to communicate with the device model + + * Domain builder was extended to provide the environment when the + guest enters the protected mode, including E820 memory and VGA + info, typically obtained by BIOS calls. + +New code: +--------- + VT (Vanderpool Technology) is based on the new VMX (Virtual + Machine Extensions) architecture. The current release of the + software supports 32-bit only. + + * arch/x86/vmx.[ch] and arch/x86/vmx_*.[ch]: created to handle + VMX-specific events in order to provide virtual machine. + + * arch/x86/x86_32/entry.S: new code path was added to have the + first-level handler from VM exits. The first-level handler calls + the second-level handler in arch/x86/vmx.c. + + * arch/x86/setup.c: new function start_vmx() to init_intel() to + enable VMX mode. + + * include/asm-x86/config.h: #ifdef CONFIG_VMX was added. + + * arch/x86/domain.c: new code patch was added to create a VMX + domain given the flag from the control panel. + + * include/public/io/ioreq.h: A new data structure was added to + define the I/O requests between the Xen hypervisor and the + device models. + +Changes to the existing code: +----------------------------- + + * arch/x86/shadow.[ch]: new mode SHM_full_32 was added to support + full virtualization. 
The current Xen code assumes that the guest + page directory and tables have _machine_ (or host) physical page + frame numbers, and the new code allows to support _guest_ + physical page frame numbers + + * include/asm-x86/processor.h: struct arch_vmx_struct arch_vmx has + been added to the thread_struct data structure. The arch_vmx has + the addtional VMX-related CPU context. + + * arch/x86/io_apic.c: reverse mapping between vector and irq has + been added. We will revisit this code when considering MSI + support. + +--- Jun + + diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 04b3e12695..646bbc3aa0 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -32,6 +32,10 @@ #include #include #include +#include +#include +#include +#include #include #if !defined(CONFIG_X86_64BITMODE) @@ -158,6 +162,9 @@ void machine_restart(char * __unused) smp_send_stop(); disable_IO_APIC(); #endif +#ifdef CONFIG_VMX + stop_vmx(); +#endif if(!reboot_thru_bios) { /* rebooting needs to touch the page at absolute addr 0 */ @@ -239,6 +246,97 @@ void arch_do_createdomain(struct exec_domain *ed) } } +#ifdef CONFIG_VMX +void arch_vmx_do_resume(struct exec_domain *d) +{ + vmx_do_resume(d); + reset_stack_and_jump(vmx_asm_do_resume); +} + +void arch_vmx_do_launch(struct exec_domain *d) +{ + vmx_do_launch(d); + reset_stack_and_jump(vmx_asm_do_launch); +} + +static void monitor_mk_pagetable(struct exec_domain *ed) +{ + unsigned long mpfn; + l2_pgentry_t *mpl2e; + struct pfn_info *mpfn_info; + struct mm_struct *m = &ed->mm; + struct domain *d = ed->domain; + + mpfn_info = alloc_domheap_page(NULL); + ASSERT( mpfn_info ); + + mpfn = (unsigned long) (mpfn_info - frame_table); + mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT); + memset(mpl2e, 0, PAGE_SIZE); + + memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + m->monitor_table = mk_pagetable(mpfn << 
PAGE_SHIFT); + m->shadow_mode = SHM_full_32; + + mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK) + | __PAGE_HYPERVISOR); + + unmap_domain_mem(mpl2e); +} + +static int vmx_final_setup_guestos(struct exec_domain *d, + full_execution_context_t *full_context) +{ + int error; + execution_context_t *context; + struct vmcs_struct *vmcs; + unsigned long guest_pa; + + context = &full_context->cpu_ctxt; + + /* + * Create a new VMCS + */ + if (!(vmcs = alloc_vmcs())) { + printk("Failed to create a new VMCS\n"); + return -ENOMEM; + } + + memset(&d->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct)); + + d->thread.arch_vmx.vmcs = vmcs; + error = construct_vmcs(&d->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV); + if (error < 0) { + printk("Failed to construct a new VMCS\n"); + goto out; + } + + monitor_mk_pagetable(d); + guest_pa = pagetable_val(d->mm.pagetable); + clear_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state); + + d->thread.arch_vmx.vmx_platform.real_mode_data = + (unsigned long *) context->esi; + + memset(&d->domain->shared_info->evtchn_mask[0], 0xff, + sizeof(d->domain->shared_info->evtchn_mask)); + clear_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_mask[0]); + + d->thread.schedule_tail = arch_vmx_do_launch; + + return 0; + +out: + free_vmcs(vmcs); + d->thread.arch_vmx.vmcs = 0; + return error; +} +#endif + int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c) { unsigned long phys_basetab; @@ -310,6 +408,11 @@ int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c) } } +#ifdef CONFIG_VMX + if (c->flags & ECF_VMX_GUEST) + return vmx_final_setup_guestos(d, c); +#endif + return 0; } @@ -356,7 +459,8 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) struct tss_struct *tss = init_tss + smp_processor_id(); execution_context_t *stack_ec = get_execution_context(); int i; - + unsigned long vmx_domain = 
next_p->thread.arch_vmx.flags; + __cli(); /* Switch guest general-register state. */ @@ -375,12 +479,6 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) &next_p->thread.user_ctxt, sizeof(*stack_ec)); - SET_FAST_TRAP(&next_p->thread); - - /* Switch the guest OS ring-1 stack. */ - tss->esp1 = next->guestos_sp; - tss->ss1 = next->guestos_ss; - /* Maybe switch the debug registers. */ if ( unlikely(next->debugreg[7]) ) { @@ -393,6 +491,24 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p) loaddebug(next, 7); } + if (vmx_domain) { + /* Switch page tables. */ + write_ptbase(&next_p->mm); + + set_current(next_p); + /* Switch GDT and LDT. */ + __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt)); + + __sti(); + return; + } + + SET_FAST_TRAP(&next_p->thread); + + /* Switch the guest OS ring-1 stack. */ + tss->esp1 = next->guestos_sp; + tss->ss1 = next->guestos_ss; + /* Switch page tables. */ write_ptbase(&next_p->mm); } diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index 5ab74351b8..6d1593daff 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -615,6 +615,10 @@ static inline int IO_APIC_irq_trigger(int irq) int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 }; +#ifdef CONFIG_VMX +int vector_irq[256]; +#endif + static int __init assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; @@ -637,6 +641,10 @@ next: panic("ran out of interrupt sources!"); IO_APIC_VECTOR(irq) = current_vector; +#ifdef CONFIG_VMX + vector_irq[current_vector] = irq; + printk("vector_irq[%x] = %d\n", current_vector, irq); +#endif return current_vector; } diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index cb8cb31a50..3d0f4ea121 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -146,6 +146,11 @@ static void __init init_intel(struct cpuinfo_x86 *c) } } #endif + +#ifdef CONFIG_VMX + start_vmx(); +#endif + } static void __init init_amd(struct cpuinfo_x86 *c) diff 
--git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index ec08e653af..2485376472 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -120,7 +120,10 @@ static inline int clear_shadow_page( /* We clear L2 pages by zeroing the guest entries. */ case PGT_l2_page_table: p = map_domain_mem((spage - frame_table) << PAGE_SHIFT); - memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); + if (m->shadow_mode == SHM_full_32) + memset(p, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); + else + memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); unmap_domain_mem(p); break; @@ -433,12 +436,24 @@ void unshadow_table(unsigned long gpfn, unsigned int type) free_shadow_page(&d->exec_domain[0]->mm, &frame_table[spfn]); } +#ifdef CONFIG_VMX +void vmx_shadow_clear_state(struct mm_struct *m) +{ + SH_VVLOG("vmx_clear_shadow_state: \n"); + clear_shadow_state(m); +} +#endif + + unsigned long shadow_l2_table( struct mm_struct *m, unsigned long gpfn) { struct pfn_info *spfn_info; unsigned long spfn; - l2_pgentry_t *spl2e; + l2_pgentry_t *spl2e = 0, *gpl2e; + unsigned long guest_gpfn; + + __get_machine_to_phys(m, guest_gpfn, gpfn); SH_VVLOG("shadow_l2_table( %08lx )", gpfn); @@ -451,33 +466,41 @@ unsigned long shadow_l2_table( perfc_incr(shadow_l2_pages); spfn = spfn_info - frame_table; - - /* Mark pfn as being shadowed; update field to point at shadow. */ - set_shadow_status(m, gpfn, spfn | PSH_shadowed); + /* Mark pfn as being shadowed; update field to point at shadow. */ + set_shadow_status(m, guest_gpfn, spfn | PSH_shadowed); - spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); - - /* - * We could proactively fill in PDEs for pages that are already shadowed. - * However, we tried it and it didn't help performance. This is simpler. - */ - memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - #ifdef __i386__ /* Install hypervisor and 2x linear p.t. mapings. 
*/ - memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) | - __PAGE_HYPERVISOR); + if (m->shadow_mode == SHM_full_32) + vmx_update_shadow_state(m, gpfn, spfn); + else { + spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); + // can't use the linear map as we may not be in the right PT + gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT); + /* + * We could proactively fill in PDEs for pages that are already shadowed. + * However, we tried it and it didn't help performance. This is simpler. + */ + memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + /* Install hypervisor and 2x linear p.t. mapings. 
*/ + memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) | + __PAGE_HYPERVISOR); + } #endif - unmap_domain_mem(spl2e); + if (m->shadow_mode != SHM_full_32) + { + unmap_domain_mem(spl2e); + } SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn); return spfn; @@ -486,13 +509,13 @@ unsigned long shadow_l2_table( static void shadow_map_l1_into_current_l2(unsigned long va) { struct mm_struct *m = ¤t->mm; - unsigned long *gpl1e, *spl1e, gpde, spde, gl1pfn, sl1pfn, sl1ss; + unsigned long *gpl1e, *spl1e, gpl2e, spl2e, gl1pfn, sl1pfn=0, sl1ss; struct pfn_info *sl1pfn_info; int i; - gpde = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]); + __guest_get_pl2e(m, va, &gpl2e); - gl1pfn = gpde >> PAGE_SHIFT; + gl1pfn = gpl2e >> PAGE_SHIFT; sl1ss = __shadow_status(m, gl1pfn); if ( !(sl1ss & PSH_shadowed) ) @@ -510,11 +533,10 @@ static void shadow_map_l1_into_current_l2(unsigned long va) set_shadow_status(m, gl1pfn, PSH_shadowed | sl1pfn); - l2pde_general(m, &gpde, &spde, sl1pfn); + l2pde_general(m, &gpl2e, &spl2e, sl1pfn); - linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); - shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(spde); + __guest_set_pl2e(m, va, gpl2e); + __shadow_set_pl2e(m, va, spl2e); gpl1e = (unsigned long *) &(linear_pg_table[ (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]); @@ -531,13 +553,38 @@ static void shadow_map_l1_into_current_l2(unsigned long va) SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn); sl1pfn = sl1ss & PSH_pfn_mask; - l2pde_general(m, 
&gpde, &spde, sl1pfn); - - linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); - shadow_linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); + l2pde_general(m, &gpl2e, &spl2e, sl1pfn); + __guest_set_pl2e(m, va, gpl2e); + __shadow_set_pl2e(m, va, spl2e); } } +#ifdef CONFIG_VMX +void vmx_shadow_invlpg(struct mm_struct *m, unsigned long va) +{ + unsigned long gpte, spte, host_pfn; + + if (__put_user(0L, (unsigned long *) + &shadow_linear_pg_table[va >> PAGE_SHIFT])) { + vmx_shadow_clear_state(m); + return; + } + + if (__get_user(gpte, (unsigned long *) + &linear_pg_table[va >> PAGE_SHIFT])) { + return; + } + + host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; + spte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + + if (__put_user(spte, (unsigned long *) + &shadow_linear_pg_table[va >> PAGE_SHIFT])) { + return; + } +} +#endif + int shadow_fault(unsigned long va, long error_code) { unsigned long gpte, spte; @@ -718,6 +765,9 @@ static int check_pte( int level, int i) { unsigned long mask, gpfn, spfn; +#ifdef CONFIG_VMX + unsigned long guest_gpfn; +#endif if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) ) return 1; /* always safe */ @@ -761,8 +811,20 @@ static int check_pte( if ( level < 2 ) FAIL("Shadow in L1 entry?"); - if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) ) - FAIL("spfn problem g.sf=%08lx", __shadow_status(m, gpfn)); + if (m->shadow_mode == SHM_full_32) { + + guest_gpfn = phys_to_machine_mapping[gpfn]; + + if ( __shadow_status(m, guest_gpfn) != (PSH_shadowed | spfn) ) + FAIL("spfn problem g.sf=%08lx", + __shadow_status(m, guest_gpfn) ); + + } else { + if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) ) + FAIL("spfn problem g.sf=%08lx", + __shadow_status(m, gpfn) ); + } + } return 1; @@ -800,6 +862,7 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s) unsigned long gpfn, spfn; int i; l2_pgentry_t *gpl2e, *spl2e; + unsigned long host_gpfn = 0; sh_check_name = s; @@ -809,20 +872,29 
@@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s) gpfn = gptbase >> PAGE_SHIFT; - if ( !(__shadow_status(m, gpfn) & PSH_shadowed) ) + __get_phys_to_machine(m, host_gpfn, gpfn); + + if ( ! (__shadow_status(m, gpfn) & PSH_shadowed) ) { printk("%s-PT %08lx not shadowed\n", s, gptbase); - if ( __shadow_status(m, gpfn) != 0 ) - BUG(); - return 0; - } + + if( __shadow_status(m, gpfn) != 0 ) BUG(); + return 0; + } spfn = __shadow_status(m, gpfn) & PSH_pfn_mask; - if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) ) - FAILPT("ptbase shadow inconsistent1"); + if ( ! __shadow_status(m, gpfn) == (PSH_shadowed | spfn) ) + FAILPT("ptbase shadow inconsistent1"); + + if (m->shadow_mode == SHM_full_32) + { + host_gpfn = phys_to_machine_mapping[gpfn]; + gpl2e = (l2_pgentry_t *) map_domain_mem( host_gpfn << PAGE_SHIFT ); + + } else + gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT ); - gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT ); spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], @@ -830,7 +902,6 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s) ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) ) { - printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn); for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT); i++ ) @@ -851,11 +922,12 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s) L2_PAGETABLE_SHIFT]), (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != - ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | + if (m->shadow_mode != SHM_full_32) { + if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != + ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) ) - FAILPT("hypervisor per-domain map inconsistent"); - + 
FAILPT("hypervisor per-domain map inconsistent"); + } /* Check the whole L2. */ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index cc9ef381b9..3ff5e7f135 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -50,7 +50,7 @@ static s_time_t stime_irq; /* System time at last 'time update' */ static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */ static rwlock_t time_lock = RW_LOCK_UNLOCKED; -static void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs) +void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs) { write_lock_irq(&time_lock); diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c new file mode 100644 index 0000000000..f42c596592 --- /dev/null +++ b/xen/arch/x86/vmx.c @@ -0,0 +1,913 @@ +/* + * vmx.c: handling VMX architecture-related VM exits + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int vmcs_size; +unsigned int opt_vmx_debug_level; + +int start_vmx() +{ + struct vmcs_struct *vmcs; + unsigned long ecx; + u64 phys_vmcs; /* debugging */ + + vmcs_size = VMCS_SIZE; + /* + * Xen does not fill x86_capability words except 0. + */ + ecx = cpuid_ecx(1); + boot_cpu_data.x86_capability[4] = ecx; + + if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability))) + return 0; + + set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */ + + if (!(vmcs = alloc_vmcs())) { + printk("Failed to allocate VMCS\n"); + return 0; + } + + phys_vmcs = (u64) virt_to_phys(vmcs); + + if (!(__vmxon(phys_vmcs))) { + printk("VMXON is done\n"); + } + + return 1; +} + +void stop_vmx() +{ + if (test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)) + __vmxoff(); +} + +/* + * Not all cases recevie valid value in the VM-exit instruction length field. + */ +#define __get_instruction_length(len) \ + __vmread(INSTRUCTION_LEN, &(len)); \ + if ((len) < 1 || (len) > 15) \ + __vmx_bug(®s); + +static void inline __update_guest_eip(unsigned long inst_len) +{ + unsigned long current_eip; + + __vmread(GUEST_EIP, ¤t_eip); + __vmwrite(GUEST_EIP, current_eip + inst_len); +} + + +#include + +static int vmx_do_page_fault(unsigned long va, unsigned long error_code) +{ + unsigned long eip, pfn; + unsigned int index; + unsigned long gpde = 0; + int result; + struct exec_domain *ed = current; + struct mm_struct *m = &ed->mm; + +#if VMX_DEBUG + { + __vmread(GUEST_EIP, &eip); + VMX_DBG_LOG(DBG_LEVEL_VMMU, + "vmx_do_page_fault = 0x%lx, eip = %lx, erro_code = %lx\n", + va, eip, error_code); + } +#endif + /* + * Set up guest page directory cache to make linear_pt_table[] work. 
+ */ + __guest_get_pl2e(m, va, &gpde); + if (!(gpde & _PAGE_PRESENT)) + return 0; + + index = (va >> L2_PAGETABLE_SHIFT); + if (!l2_pgentry_val(m->guest_pl2e_cache[index])) { + pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT]; + + VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault: pagetable = %lx\n", + pagetable_val(m->pagetable)); + + m->guest_pl2e_cache[index] = + mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + } + + if ((result = shadow_fault(va, error_code))) + return result; + + return 0; /* failed to resolve, i.e raise #PG */ +} + +static void vmx_do_general_protection_fault(struct xen_regs *regs) +{ + unsigned long eip, error_code; + + __vmread(GUEST_EIP, &eip); + __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code); + + VMX_DBG_LOG(DBG_LEVEL_1, + "vmx_general_protection_fault: eip = %lx, erro_code = %lx\n", + eip, error_code); + + VMX_DBG_LOG(DBG_LEVEL_1, + "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n", + regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi); + + __vmx_bug(regs); +} + +static void vmx_vmexit_do_cpuid(unsigned long input, struct xen_regs *regs) +{ + int eax, ebx, ecx, edx; + unsigned long eip; + + __vmread(GUEST_EIP, &eip); + + VMX_DBG_LOG(DBG_LEVEL_1, + "do_cpuid: (eax) %x, (ebx) %x, (ecx) %x, (edx) %x, (esi) %x, (edi) %x\n", regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi); + + cpuid(input, &eax, &ebx, &ecx, &edx); + + if (input == 1) { + clear_bit(X86_FEATURE_PSE, &edx); + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); + } + + regs->eax = (unsigned long) eax; + regs->ebx = (unsigned long) ebx; + regs->ecx = (unsigned long) ecx; + regs->edx = (unsigned long) edx; + + VMX_DBG_LOG(DBG_LEVEL_1, + "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x\n", + eip, input, eax, ebx, ecx, edx); + +} + +#define CASE_GET_REG_P(REG, reg) \ + case REG_ ## REG: reg_p = &(regs->reg); break + +static void vmx_dr_access (unsigned long exit_qualification, struct xen_regs 
*regs) +{ + unsigned int reg; + u32 *reg_p = 0; + struct exec_domain *ed = current; + u32 eip; + + __vmread(GUEST_EIP, &eip); + + reg = exit_qualification & DEBUG_REG_ACCESS_NUM; + + VMX_DBG_LOG(DBG_LEVEL_1, + "vmx_dr_access : eip=%08x, reg=%d, exit_qualification = %lx\n", + eip, reg, exit_qualification); + + switch(exit_qualification & DEBUG_REG_ACCESS_REG) { + CASE_GET_REG_P(EAX, eax); + CASE_GET_REG_P(ECX, ecx); + CASE_GET_REG_P(EDX, edx); + CASE_GET_REG_P(EBX, ebx); + CASE_GET_REG_P(EBP, ebp); + CASE_GET_REG_P(ESI, esi); + CASE_GET_REG_P(EDI, edi); + case REG_ESP: + break; + default: + __vmx_bug(regs); + } + + switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) { + case TYPE_MOV_TO_DR: + /* don't need to check the range */ + if (reg != REG_ESP) + ed->thread.debugreg[reg] = *reg_p; + else { + unsigned long value; + __vmread(GUEST_ESP, &value); + ed->thread.debugreg[reg] = value; + } + break; + case TYPE_MOV_FROM_DR: + if (reg != REG_ESP) + *reg_p = ed->thread.debugreg[reg]; + else { + __vmwrite(GUEST_ESP, ed->thread.debugreg[reg]); + } + break; + } +} + +/* + * Invalidate the TLB for va. Invalidate the shadow page corresponding + * the address va. 
+ */ +static void vmx_vmexit_do_invlpg(unsigned long va) +{ + unsigned long eip; + struct exec_domain *d = current; + unsigned int index; + + __vmread(GUEST_EIP, &eip); + + VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%08lx, va=%08lx\n", + eip, va); + + /* + * We do the safest things first, then try to update the shadow + * copying from guest + */ + vmx_shadow_invlpg(&d->mm, va); + index = (va >> L2_PAGETABLE_SHIFT); + d->mm.guest_pl2e_cache[index] = mk_l2_pgentry(0); /* invalidate pgd cache */ +} + +static inline void guest_pl2e_cache_invalidate(struct mm_struct *m) +{ + /* + * Need to optimize this + */ + memset(m->guest_pl2e_cache, 0, PAGE_SIZE); +} + +static inline unsigned long gva_to_gpa(unsigned long gva) +{ + unsigned long gpde, gpte, pfn, index; + struct exec_domain *d = current; + struct mm_struct *m = &d->mm; + + __guest_get_pl2e(m, gva, &gpde); + index = (gva >> L2_PAGETABLE_SHIFT); + + pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT]; + + m->guest_pl2e_cache[index] = + mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + if ( unlikely(__get_user(gpte, (unsigned long *) + &linear_pg_table[gva >> PAGE_SHIFT])) ) + { + printk("gva_to_gpa EXIT: read gpte faulted" ); + return 0; + } + + if ( !(gpte & _PAGE_PRESENT) ) + { + printk("gva_to_gpa - EXIT: gpte not present (%lx)",gpte ); + return 0; + } + + return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); +} + +static void vmx_io_instruction(struct xen_regs *regs, + unsigned long exit_qualification, unsigned long inst_len) +{ + struct exec_domain *d = current; + vcpu_iodata_t *vio; + ioreq_t *p; + unsigned long addr; + unsigned long eip; + + extern long evtchn_send(int lport); + extern long do_block(void); + + __vmread(GUEST_EIP, &eip); + + VMX_DBG_LOG(DBG_LEVEL_1, + "vmx_io_instruction: eip=%08lx, exit_qualification = %lx\n", + eip, exit_qualification); + + if (test_bit(6, &exit_qualification)) + addr = (exit_qualification >> 16) & (0xffff); + else + addr = regs->edx & 0xffff; + + if (addr == 
0x80) { + __update_guest_eip(inst_len); + return; + } + + vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va; + if (vio == 0) { + VMX_DBG_LOG(DBG_LEVEL_1, "bad shared page: %lx\n", (unsigned long) vio); + domain_crash(); + } + p = &vio->vp_ioreq; + p->dir = test_bit(3, &exit_qualification); + set_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags); + + p->pdata_valid = 0; + p->count = 1; + p->size = (exit_qualification & 7) + 1; + + if (test_bit(4, &exit_qualification)) { + p->pdata_valid = 1; + p->u.pdata = (void *) ((p->dir == IOREQ_WRITE) ? + regs->esi + : regs->edi); + p->u.pdata = (void *) gva_to_gpa(p->u.data); + if (test_bit(5, &exit_qualification)) + p->count = regs->ecx; + if ((p->u.data & PAGE_MASK) != + ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) { + printk("stringio crosses page boundary!\n"); + if (p->u.data & (p->size - 1)) { + printk("Not aligned I/O!\n"); + domain_crash(); + } + p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size; + } else { + __update_guest_eip(inst_len); + } + } else if (p->dir == IOREQ_WRITE) { + p->u.data = regs->eax; + __update_guest_eip(inst_len); + } else + __update_guest_eip(inst_len); + + p->addr = addr; + p->port_mm = 0; + p->state = STATE_IOREQ_READY; + evtchn_send(IOPACKET_PORT); + do_block(); +} + +#define CASE_GET_REG(REG, reg) \ + case REG_ ## REG: value = regs->reg; break + +/* + * Write to control registers + */ +static void mov_to_cr(int gp, int cr, struct xen_regs *regs) +{ + unsigned long value; + unsigned long old_cr; + struct exec_domain *d = current; + + switch (gp) { + CASE_GET_REG(EAX, eax); + CASE_GET_REG(ECX, ecx); + CASE_GET_REG(EDX, edx); + CASE_GET_REG(EBX, ebx); + CASE_GET_REG(EBP, ebp); + CASE_GET_REG(ESI, esi); + CASE_GET_REG(EDI, edi); + case REG_ESP: + __vmread(GUEST_ESP, &value); + break; + default: + printk("invalid gp: %d\n", gp); + __vmx_bug(regs); + } + + VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, \n", cr, value); + VMX_DBG_LOG(DBG_LEVEL_1, 
"current = %lx, \n", (unsigned long) current); + + switch(cr) { + case 0: + { + unsigned long old_base_pfn = 0, pfn; + + /* + * CR0: + * We don't want to lose PE and PG. + */ + __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG)); + __vmwrite(CR0_READ_SHADOW, value); + + if (value & (X86_CR0_PE | X86_CR0_PG) && + !test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) { + /* + * Enable paging + */ + set_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state); + /* + * The guest CR3 must be pointing to the guest physical. + */ + if (!(pfn = phys_to_machine_mapping[ + d->thread.arch_vmx.cpu_cr3 >> PAGE_SHIFT])) + { + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value = %lx\n", + d->thread.arch_vmx.cpu_cr3); + domain_crash(); /* need to take a clean path */ + } + old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT; + /* + * Now mm.pagetable points to machine physical. + */ + d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); + + VMX_DBG_LOG(DBG_LEVEL_VMMU, "New mm.pagetable = %lx\n", + (unsigned long) (pfn << PAGE_SHIFT)); + + shadow_lock(&d->mm); + shadow_mode_enable(d->domain, SHM_full_32); + shadow_unlock(&d->mm); + + __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table)); + /* + * mm->shadow_table should hold the next CR3 for shadow + */ + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, pfn = %lx\n", + d->thread.arch_vmx.cpu_cr3, pfn); + put_page_and_type(&frame_table[old_base_pfn]); + + } + break; + } + case 3: + { + unsigned long pfn; + + /* + * If paging is not enabled yet, simply copy the valut to CR3. + */ + if (!test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) { + d->thread.arch_vmx.cpu_cr3 = value; + return; + } + + guest_pl2e_cache_invalidate(&d->mm); + /* + * We make a new one if the shadow does not exist. + */ + if (value == d->thread.arch_vmx.cpu_cr3) { + /* + * This is simple TLB flush, implying the guest has + * removed some translation or changed page attributes. + * We simply invalidate the shadow. 
+ */ + pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; + if ((pfn << PAGE_SHIFT) != pagetable_val(d->mm.pagetable)) + __vmx_bug(regs); + vmx_shadow_clear_state(&d->mm); + shadow_invalidate(&d->mm); + } else { + /* + * If different, make a shadow. Check if the PDBR is valid + * first. + */ + VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx\n", value); + if ((value >> PAGE_SHIFT) > d->domain->max_pages) + { + VMX_DBG_LOG(DBG_LEVEL_VMMU, + "Invalid CR3 value=%lx\n", value); + domain_crash(); /* need to take a clean path */ + } + pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; + vmx_shadow_clear_state(&d->mm); + d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); + shadow_mk_pagetable(&d->mm); + /* + * mm->shadow_table should hold the next CR3 for shadow + */ + d->thread.arch_vmx.cpu_cr3 = value; + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx\n", + value); + __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table)); + } + break; + } + case 4: + /* CR4 */ + if (value & X86_CR4_PAE) + __vmx_bug(regs); /* not implemented */ + __vmread(CR4_READ_SHADOW, &old_cr); + + __vmwrite(GUEST_CR4, (value | X86_CR4_VMXE)); + __vmwrite(CR4_READ_SHADOW, value); + + /* + * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates + * all TLB entries except global entries. + */ + if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { + vmx_shadow_clear_state(&d->mm); + shadow_invalidate(&d->mm); + guest_pl2e_cache_invalidate(&d->mm); + } + break; + default: + printk("invalid cr: %d\n", gp); + __vmx_bug(regs); + } +} + +#define CASE_SET_REG(REG, reg) \ + case REG_ ## REG: \ + regs->reg = value; \ + break + +/* + * Read from control registers. CR0 and CR4 are read from the shadow. 
+ */ +static void mov_from_cr(int cr, int gp, struct xen_regs *regs) +{ + unsigned long value; + struct exec_domain *d = current; + + if (cr != 3) + __vmx_bug(regs); + + value = (unsigned long) d->thread.arch_vmx.cpu_cr3; + ASSERT(value); + + switch (gp) { + CASE_SET_REG(EAX, eax); + CASE_SET_REG(ECX, ecx); + CASE_SET_REG(EDX, edx); + CASE_SET_REG(EBX, ebx); + CASE_SET_REG(EBP, ebp); + CASE_SET_REG(ESI, esi); + CASE_SET_REG(EDI, edi); + case REG_ESP: + __vmwrite(GUEST_ESP, value); + regs->esp = value; + break; + default: + printk("invalid gp: %d\n", gp); + __vmx_bug(regs); + } + + VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx, \n", cr, value); +} + +static void vmx_cr_access (unsigned long exit_qualification, struct xen_regs *regs) +{ + unsigned int gp, cr; + unsigned long value; + + switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) { + case TYPE_MOV_TO_CR: + gp = exit_qualification & CONTROL_REG_ACCESS_REG; + cr = exit_qualification & CONTROL_REG_ACCESS_NUM; + mov_to_cr(gp, cr, regs); + break; + case TYPE_MOV_FROM_CR: + gp = exit_qualification & CONTROL_REG_ACCESS_REG; + cr = exit_qualification & CONTROL_REG_ACCESS_NUM; + mov_from_cr(cr, gp, regs); + break; + case TYPE_CLTS: + __vmread(GUEST_CR0, &value); + value &= ~X86_CR0_TS; /* clear TS */ + __vmwrite(GUEST_CR0, value); + + __vmread(CR0_READ_SHADOW, &value); + value &= ~X86_CR0_TS; /* clear TS */ + __vmwrite(CR0_READ_SHADOW, value); + break; + default: + __vmx_bug(regs); + break; + } +} + +static inline void vmx_do_msr_read(struct xen_regs *regs) +{ + VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%x, eax=%x, edx=%x", + regs->ecx, regs->eax, regs->edx); + + rdmsr(regs->ecx, regs->eax, regs->edx); + + VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: ecx=%x, eax=%x, edx=%x", + regs->ecx, regs->eax, regs->edx); +} + +/* + * Need to use this exit to rescheule + */ +static inline void vmx_vmexit_do_hlt() +{ + extern long do_block(void); +#if VMX_DEBUG + unsigned long eip; + __vmread(GUEST_EIP, 
&eip); +#endif + VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%08lx\n", eip); + __enter_scheduler(); +} + +static inline void vmx_vmexit_do_mwait() +{ +#if VMX_DEBUG + unsigned long eip; + __vmread(GUEST_EIP, &eip); +#endif + VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%08lx\n", eip); + __enter_scheduler(); +} + +#define BUF_SIZ 256 +#define MAX_LINE 80 +char print_buf[BUF_SIZ]; +static int index; + +static void vmx_print_line(const char c, struct exec_domain *d) +{ + + if (index == MAX_LINE || c == '\n') { + if (index == MAX_LINE) { + print_buf[index++] = c; + } + print_buf[index] = '\0'; + printk("(GUEST: %u) %s\n", d->domain->id, (char *) &print_buf); + index = 0; + } + else + print_buf[index++] = c; +} + +#ifdef XEN_DEBUGGER +void save_xen_regs(struct xen_regs *regs) +{ + __vmread(GUEST_SS_SELECTOR, ®s->xss); + __vmread(GUEST_ESP, ®s->esp); + __vmread(GUEST_EFLAGS, ®s->eflags); + __vmread(GUEST_CS_SELECTOR, ®s->xcs); + __vmread(GUEST_EIP, ®s->eip); + + __vmread(GUEST_GS_SELECTOR, ®s->xgs); + __vmread(GUEST_FS_SELECTOR, ®s->xfs); + __vmread(GUEST_ES_SELECTOR, ®s->xes); + __vmread(GUEST_DS_SELECTOR, ®s->xds); +} + +void restore_xen_regs(struct xen_regs *regs) +{ + __vmwrite(GUEST_SS_SELECTOR, regs->xss); + __vmwrite(GUEST_ESP, regs->esp); + __vmwrite(GUEST_EFLAGS, regs->eflags); + __vmwrite(GUEST_CS_SELECTOR, regs->xcs); + __vmwrite(GUEST_EIP, regs->eip); + + __vmwrite(GUEST_GS_SELECTOR, regs->xgs); + __vmwrite(GUEST_FS_SELECTOR, regs->xfs); + __vmwrite(GUEST_ES_SELECTOR, regs->xes); + __vmwrite(GUEST_DS_SELECTOR, regs->xds); +} +#endif + +asmlinkage void vmx_vmexit_handler(struct xen_regs regs) +{ + unsigned int exit_reason, idtv_info_field; + unsigned long exit_qualification, eip, inst_len = 0; + struct exec_domain *d = current; + int error; + + if ((error = __vmread(VM_EXIT_REASON, &exit_reason))) + __vmx_bug(®s); + + __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field); + if (idtv_info_field & INTR_INFO_VALID_MASK) { + 
__vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); + if ((idtv_info_field & 0xff) == 14) { + unsigned long error_code; + + __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code); + printk("#PG error code: %lx\n", error_code); + } + VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x\n", + idtv_info_field); + } + + /* don't bother H/W interrutps */ + if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT && + exit_reason != EXIT_REASON_VMCALL && + exit_reason != EXIT_REASON_IO_INSTRUCTION) + VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x\n", exit_reason); + + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + __vmread(EXIT_QUALIFICATION, &exit_qualification); + __vmread(GUEST_EIP, &eip); + domain_crash(); + return; + } + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + { +#define VECTOR_DB 1 +#define VECTOR_BP 3 +#define VECTOR_GP 13 +#define VECTOR_PG 14 + + /* + * We don't set the software-interrupt exiting (INT n). + * (1) We can get an exception (e.g. #PG) in the guest, or + * (2) NMI + */ + int error; + unsigned int vector; + unsigned long va; + unsigned long error_code; + + if ((error = __vmread(VM_EXIT_INTR_INFO, &vector)) + && !(vector & INTR_INFO_VALID_MASK)) + __vmx_bug(®s); + vector &= 0xff; + + switch (vector) { +#ifdef XEN_DEBUGGER + case VECTOR_DB: + { + save_xen_regs(®s); + pdb_handle_exception(1, ®s, 1); + restore_xen_regs(®s); + break; + } + case VECTOR_BP: + { + save_xen_regs(®s); + pdb_handle_exception(3, ®s, 1); + restore_xen_regs(®s); + break; + } +#endif + case VECTOR_GP: + { + vmx_do_general_protection_fault(®s); + break; + } + case VECTOR_PG: + { + __vmread(EXIT_QUALIFICATION, &va); + __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code); + VMX_DBG_LOG(DBG_LEVEL_VMMU, + "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n", regs.eax, regs.ebx, regs.ecx, regs.edx, regs.esi, regs.edi); + + if (!(error = vmx_do_page_fault(va, error_code))) { + /* + * Inject #PG using Interruption-Information Fields + */ + unsigned long intr_fields; + + intr_fields = 
(INTR_INFO_VALID_MASK | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + VECTOR_PG); + __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields); + __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + d->thread.arch_vmx.cpu_cr2 = va; + } + break; + } + default: + __vmx_bug(®s); + break; + } + break; + } + case EXIT_REASON_EXTERNAL_INTERRUPT: + { + extern int vector_irq[]; + extern asmlinkage void do_IRQ(struct xen_regs); + extern void smp_apic_timer_interrupt(struct xen_regs *); + extern void timer_interrupt(int, void *, struct xen_regs *); + unsigned int vector; + + if ((error = __vmread(VM_EXIT_INTR_INFO, &vector)) + && !(vector & INTR_INFO_VALID_MASK)) + __vmx_bug(®s); + + vector &= 0xff; + local_irq_disable(); + + if (vector == LOCAL_TIMER_VECTOR) { + smp_apic_timer_interrupt(®s); + } else { + regs.entry_vector = (vector == FIRST_DEVICE_VECTOR? + 0 : vector_irq[vector]); + do_IRQ(regs); + } + break; + } + case EXIT_REASON_PENDING_INTERRUPT: + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, + MONITOR_CPU_BASED_EXEC_CONTROLS); + vmx_intr_assist(d); + break; + case EXIT_REASON_TASK_SWITCH: + __vmx_bug(®s); + break; + case EXIT_REASON_CPUID: + __get_instruction_length(inst_len); + vmx_vmexit_do_cpuid(regs.eax, ®s); + __update_guest_eip(inst_len); + break; + case EXIT_REASON_HLT: + __get_instruction_length(inst_len); + __update_guest_eip(inst_len); + vmx_vmexit_do_hlt(); + break; + case EXIT_REASON_INVLPG: + { + unsigned long va; + + __vmread(EXIT_QUALIFICATION, &va); + vmx_vmexit_do_invlpg(va); + __get_instruction_length(inst_len); + __update_guest_eip(inst_len); + break; + } + case EXIT_REASON_VMCALL: + __get_instruction_length(inst_len); + __vmread(GUEST_EIP, &eip); + __vmread(EXIT_QUALIFICATION, &exit_qualification); + + vmx_print_line(regs.eax, d); /* provides the current domain */ + __update_guest_eip(inst_len); + break; + case EXIT_REASON_CR_ACCESS: + { + __vmread(GUEST_EIP, &eip); + __get_instruction_length(inst_len); + __vmread(EXIT_QUALIFICATION, 
&exit_qualification); + + VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx\n", + eip, inst_len, exit_qualification); + vmx_cr_access(exit_qualification, ®s); + __update_guest_eip(inst_len); + break; + } + case EXIT_REASON_DR_ACCESS: + __vmread(EXIT_QUALIFICATION, &exit_qualification); + vmx_dr_access(exit_qualification, ®s); + __get_instruction_length(inst_len); + __update_guest_eip(inst_len); + break; + case EXIT_REASON_IO_INSTRUCTION: + __vmread(EXIT_QUALIFICATION, &exit_qualification); + __get_instruction_length(inst_len); + vmx_io_instruction(®s, exit_qualification, inst_len); + break; + case EXIT_REASON_MSR_READ: + __get_instruction_length(inst_len); + vmx_do_msr_read(®s); + __update_guest_eip(inst_len); + break; + case EXIT_REASON_MSR_WRITE: + __vmread(GUEST_EIP, &eip); + VMX_DBG_LOG(DBG_LEVEL_1, "MSR_WRITE: eip=%08lx, eax=%08x, edx=%08x", + eip, regs.eax, regs.edx); + /* just ignore this point */ + __get_instruction_length(inst_len); + __update_guest_eip(inst_len); + break; + case EXIT_REASON_MWAIT_INSTRUCTION: + __get_instruction_length(inst_len); + __update_guest_eip(inst_len); + vmx_vmexit_do_mwait(); + break; + default: + __vmx_bug(®s); /* should not happen */ + } + return; +} + +asmlinkage void load_cr2(void) +{ + struct exec_domain *d = current; + + local_irq_disable(); + asm volatile("movl %0,%%cr2": :"r" (d->thread.arch_vmx.cpu_cr2)); +} diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c new file mode 100644 index 0000000000..881e297549 --- /dev/null +++ b/xen/arch/x86/vmx_io.c @@ -0,0 +1,234 @@ +/* + * vmx_io.c: handling I/O, interrupts related VMX entry/exit + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +void vmx_io_assist(struct exec_domain *ed) +{ + vcpu_iodata_t *vio; + ioreq_t *p; + struct domain *d = ed->domain; + execution_context_t *ec = get_execution_context(); + unsigned long old_eax; + extern long do_block(); + unsigned long eflags; + int dir; + + /* clear the pending event */ + ed->vcpu_info->evtchn_upcall_pending = 0; + /* clear the pending bit for port 2 */ + clear_bit(IOPACKET_PORT>>5, &ed->vcpu_info->evtchn_pending_sel); + clear_bit(IOPACKET_PORT, &d->shared_info->evtchn_pending[0]); + + vio = (vcpu_iodata_t *) ed->thread.arch_vmx.vmx_platform.shared_page_va; + if (vio == 0) { + VMX_DBG_LOG(DBG_LEVEL_1, + "bad shared page: %lx\n", (unsigned long) vio); + domain_crash(); + } + p = &vio->vp_ioreq; + /* clear IO wait VMX flag */ + if (test_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags)) { + if (p->state != STATE_IORESP_READY) { + printk("got a false I/O reponse\n"); + do_block(); + } else { + p->state = STATE_INVALID; + } + clear_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags); + } else { + return; + } + + __vmread(GUEST_EFLAGS, &eflags); + dir = (eflags & X86_EFLAGS_DF); + + if (p->dir == IOREQ_WRITE) { + if (p->pdata_valid) { + if (!dir) + ec->esi += p->count * p->size; + else + ec->esi -= p->count * p->size; + ec->ecx -= p->count; + } + return; + } else { + if (p->pdata_valid) { + if (!dir) + ec->edi += p->count * p->size; + else + ec->edi -= p->count * 
p->size; + ec->ecx -= p->count; + return; + } + } + + old_eax = ec->eax; + + switch(p->size) { + case 1: + ec->eax = (old_eax & 0xffffff00) | (p->u.data & 0xff); + break; + case 2: + ec->eax = (old_eax & 0xffff0000) | (p->u.data & 0xffff); + break; + case 4: + ec->eax = (p->u.data & 0xffffffff); + break; + default: + BUG(); + } +} + +static inline int __fls(unsigned long word) +{ + int bit; + + __asm__("bsrl %1,%0" + :"=r" (bit) + :"rm" (word)); + return word ? bit : -1; +} + + +/* Simple minded Local APIC priority implementation. Fix later */ +static __inline__ int find_highest_irq(unsigned long *pintr) +{ + if (pintr[7]) + return __fls(pintr[7]) + (256-32*1); + if (pintr[6]) + return __fls(pintr[6]) + (256-32*2); + if (pintr[5]) + return __fls(pintr[5]) + (256-32*3); + if (pintr[4]) + return __fls(pintr[4]) + (256-32*4); + if (pintr[3]) + return __fls(pintr[3]) + (256-32*5); + if (pintr[2]) + return __fls(pintr[2]) + (256-32*6); + if (pintr[1]) + return __fls(pintr[1]) + (256-32*7); + return __fls(pintr[0]); +} + +/* + * Return 0-255 for pending irq. + * -1 when no pending. 
+ */ +static inline int find_highest_pending_irq(struct exec_domain *d) +{ + vcpu_iodata_t *vio; + + vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va; + if (vio == 0) { + VMX_DBG_LOG(DBG_LEVEL_1, + "bad shared page: %lx\n", (unsigned long) vio); + domain_crash(); + } + + return find_highest_irq(&vio->vp_intr[0]); +} + +static inline void clear_highest_bit(struct exec_domain *d, int vector) +{ + vcpu_iodata_t *vio; + + vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va; + if (vio == 0) { + VMX_DBG_LOG(DBG_LEVEL_1, + "bad shared page: %lx\n", (unsigned long) vio); + domain_crash(); + } + + clear_bit(vector, &vio->vp_intr[0]); +} + +static inline int irq_masked(unsigned long eflags) +{ + return ((eflags & X86_EFLAGS_IF) == 0); +} + +void vmx_intr_assist(struct exec_domain *d) +{ + int highest_vector = find_highest_pending_irq(d); + unsigned long intr_fields, eflags; + + if (highest_vector == -1) + return; + + __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields); + if (intr_fields & INTR_INFO_VALID_MASK) { + VMX_DBG_LOG(DBG_LEVEL_1, "vmx_intr_assist: intr_fields: %lx\n", + intr_fields); + return; + } + + __vmread(GUEST_EFLAGS, &eflags); + if (irq_masked(eflags)) { + VMX_DBG_LOG(DBG_LEVEL_1, "guesting pending: %x, eflags: %lx\n", + highest_vector, eflags); + return; + } + + clear_highest_bit(d, highest_vector); + intr_fields = (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | highest_vector); + __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields); + + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + + return; +} + +void vmx_do_resume(struct exec_domain *d) +{ + extern long do_block(); + + __vmwrite(HOST_CR3, pagetable_val(d->mm.monitor_table)); + __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table)); + __vmwrite(HOST_ESP, (unsigned long) get_stack_top()); + + if (event_pending(d)) { + if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0])) + vmx_io_assist(d); + + else if (test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags)) 
{ + printk("got an event while blocked on I/O\n"); + do_block(); + } + + /* Assumption: device model will not inject an interrupt + * while an ioreq_t is pending i.e. the response and + * interrupt can come together. But an interrupt without + * a response to ioreq_t is not ok. + */ + } + if (!test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags)) + vmx_intr_assist(d); +} diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c new file mode 100644 index 0000000000..755d481538 --- /dev/null +++ b/xen/arch/x86/vmx_vmcs.c @@ -0,0 +1,503 @@ +/* + * vmx_vmcs.c: VMCS management + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct vmcs_struct *alloc_vmcs(void) +{ + struct vmcs_struct *vmcs; + unsigned int cpu_sig = cpuid_eax(0x00000001); + + vmcs = (struct vmcs_struct *) alloc_xenheap_pages(get_order(vmcs_size)); + memset((char *) vmcs, 0, vmcs_size); /* don't remove this */ + + vmcs->vmcs_revision_id = (cpu_sig > 0xf41)? 
3 : 1; + return vmcs; +} + +void free_vmcs(struct vmcs_struct *vmcs) +{ + int order; + + order = (vmcs_size >> PAGE_SHIFT) - 1; + free_xenheap_pages((unsigned long) vmcs, order); +} + +static inline int construct_vmcs_controls(void) +{ + int error = 0; + + error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, + MONITOR_PIN_BASED_EXEC_CONTROLS); + + error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, + MONITOR_CPU_BASED_EXEC_CONTROLS); + + error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS); + error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS); + + return error; +} + +#define GUEST_SEGMENT_LIMIT 0xffffffff +#define HOST_SEGMENT_LIMIT 0xffffffff + +struct host_execution_env { + /* selectors */ + unsigned short ldtr_selector; + unsigned short tr_selector; + unsigned short ds_selector; + unsigned short cs_selector; + /* limits */ + unsigned short gdtr_limit; + unsigned short ldtr_limit; + unsigned short idtr_limit; + unsigned short tr_limit; + /* base */ + unsigned long gdtr_base; + unsigned long ldtr_base; + unsigned long idtr_base; + unsigned long tr_base; + unsigned long ds_base; + unsigned long cs_base; + /* control registers */ + unsigned long cr3; + unsigned long cr0; + unsigned long cr4; + unsigned long dr7; +}; + +#define round_pgdown(_p) ((_p)&PAGE_MASK) /* coped from domain.c */ + +int vmx_setup_platform(struct exec_domain *d, execution_context_t *context) +{ + int i; + unsigned int n; + unsigned long *p, mpfn, offset, addr; + struct e820entry *e820p; + unsigned long gpfn = 0; + + context->ebx = 0; /* Linux expects ebx to be 0 for boot proc */ + + n = context->ecx; + if (n > 32) { + VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d\n", n); + return -1; + } + + addr = context->edi; + offset = (addr & ~PAGE_MASK); + addr = round_pgdown(addr); + mpfn = phys_to_machine_mapping[addr >> PAGE_SHIFT]; + p = map_domain_mem(mpfn << PAGE_SHIFT); + + e820p = (struct e820entry *) ((unsigned long) p + offset); + + for (i = 0; i < n; i++) { + if (e820p[i].type 
== E820_SHARED_PAGE) { + gpfn = (e820p[i].addr >> PAGE_SHIFT); + break; + } + } + + if (gpfn == 0) { + VMX_DBG_LOG(DBG_LEVEL_1, "No shared Page ?\n"); + return -1; + } + unmap_domain_mem(p); + + mpfn = phys_to_machine_mapping[gpfn]; + p = map_domain_mem(mpfn << PAGE_SHIFT); + d->thread.arch_vmx.vmx_platform.shared_page_va = (unsigned long) p; + + return 0; +} + + +/* + * Add mapping to per-domain mapping. Full + * virtualization does not need per-domain mapping. + */ +static int add_mapping_perdomain(struct exec_domain *d, unsigned long gpfn, + unsigned long mpfn) +{ + struct pfn_info *page; + unsigned long pfn = 0; + + /* + * We support up to 4GB memory for a guest at this point + */ + if (gpfn > ENTRIES_PER_L2_PAGETABLE * ENTRIES_PER_L1_PAGETABLE) + return -1; + + if (!(l1_pgentry_val(d->domain->mm_perdomain_pt[ + gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]) & _PAGE_PRESENT)) + { + page = (struct pfn_info *) alloc_domheap_page(NULL); + if (!page) { + return -1; + } + + pfn = (unsigned long) (page - frame_table); + d->domain->mm_perdomain_pt[gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)] = + mk_l1_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + } + phys_to_machine_mapping[gpfn] = mpfn; + + return 0; +} + +void vmx_do_launch(struct exec_domain *ed) +{ +/* Update CR3, GDT, LDT, TR */ + unsigned int tr, cpu, error = 0; + struct host_execution_env host_env; + struct Xgt_desc_struct desc; + struct list_head *list_ent; + l2_pgentry_t *mpl2e, *guest_pl2e_cache; + unsigned long i, pfn = 0; + struct pfn_info *page; + execution_context_t *ec = get_execution_context(); + struct domain *d = ed->domain; + + cpu = smp_processor_id(); + ed->mm.min_pfn = ed->mm.max_pfn = 0; + + spin_lock(&d->page_alloc_lock); + list_ent = d->page_list.next; + + mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(ed->mm.monitor_table)); + ASSERT(mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]); + + for (i = 0; list_ent != &d->page_list; i++ ) { + pfn = list_entry(list_ent, 
struct pfn_info, list) - frame_table;
+        ed->mm.min_pfn = min(ed->mm.min_pfn, pfn);
+        ed->mm.max_pfn = max(ed->mm.max_pfn, pfn);
+        list_ent = frame_table[pfn].list.next;
+        add_mapping_perdomain(ed, i, pfn);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    page = (struct pfn_info *) alloc_domheap_page(NULL);
+    pfn = (unsigned long) (page - frame_table);
+
+    /*
+     * make linear_pt_table work for guest ptes
+     */
+    mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((pfn << PAGE_SHIFT)| __PAGE_HYPERVISOR);
+
+    guest_pl2e_cache = map_domain_mem(pfn << PAGE_SHIFT);
+    memset(guest_pl2e_cache, 0, PAGE_SIZE); /* clean it up */
+    ed->mm.guest_pl2e_cache = guest_pl2e_cache;
+
+    unmap_domain_mem(mpl2e);
+
+    vmx_setup_platform(ed, ec);
+
+    __asm__ __volatile__ ("sgdt (%%eax) \n" :: "a"(&desc) : "memory");
+    host_env.gdtr_limit = desc.size;
+    host_env.gdtr_base = desc.address;
+
+    error |= __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
+    error |= __vmwrite(GUEST_LDTR_BASE, 0);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
+
+    __asm__ __volatile__ ("str (%%eax) \n" :: "a"(&tr) : "memory");
+    host_env.tr_selector = tr;
+    host_env.tr_limit = sizeof(struct tss_struct);
+    host_env.tr_base = (unsigned long) &init_tss[cpu];
+
+    error |= __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector);
+    error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
+    error |= __vmwrite(GUEST_TR_BASE, 0);
+    error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
+
+    ed->mm.shadow_table = ed->mm.pagetable;
+    __vmwrite(GUEST_CR3, pagetable_val(ed->mm.pagetable));
+    __vmwrite(HOST_CR3, pagetable_val(ed->mm.monitor_table));
+    __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+
+    ed->thread.schedule_tail = arch_vmx_do_resume;
+}
+
+/*
+ * Initially set the same environment as host.
+ */ +static inline int +construct_init_vmcs_guest(execution_context_t *context, + full_execution_context_t *full_context, + struct host_execution_env *host_env) +{ + int error = 0; + union vmcs_arbytes arbytes; + unsigned long dr7; + unsigned long eflags, shadow_cr; + + /* MSR */ + error |= __vmwrite(VM_EXIT_MSR_LOAD_ADDR, 0); + error |= __vmwrite(VM_EXIT_MSR_STORE_ADDR, 0); + + error |= __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + error |= __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + error |= __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + /* interrupt */ + error |= __vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); + /* mask */ + error |= __vmwrite(CR0_GUEST_HOST_MASK, 0xffffffff); + error |= __vmwrite(CR4_GUEST_HOST_MASK, 0xffffffff); + + error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0); + + /* TSC */ + error |= __vmwrite(TSC_OFFSET, 0); + error |= __vmwrite(CR3_TARGET_COUNT, 0); + + /* Guest Selectors */ + error |= __vmwrite(GUEST_CS_SELECTOR, context->cs); + error |= __vmwrite(GUEST_ES_SELECTOR, context->es); + error |= __vmwrite(GUEST_SS_SELECTOR, context->ss); + error |= __vmwrite(GUEST_DS_SELECTOR, context->ds); + error |= __vmwrite(GUEST_FS_SELECTOR, context->fs); + error |= __vmwrite(GUEST_GS_SELECTOR, context->gs); + + /* Guest segment Limits */ + error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT); + error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT); + error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT); + error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT); + error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT); + error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT); + + error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit); + + /* AR bytes */ + arbytes.bytes = 0; + arbytes.fields.seg_type = 0x3; /* type = 3 */ + arbytes.fields.s = 1; /* code or data, i.e. 
not system */
+    arbytes.fields.dpl = 0;                /* DPL = 0 */
+    arbytes.fields.p = 1;                  /* segment present */
+    arbytes.fields.default_ops_size = 1;   /* 32-bit */
+    arbytes.fields.g = 1;
+    arbytes.fields.null_bit = 0;           /* not null */
+
+    error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, arbytes.bytes);
+
+    arbytes.fields.seg_type = 0xb;         /* type = 0xb */
+    error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GDTR_BASE, context->edx);
+    context->edx = 0;
+    error |= __vmwrite(GUEST_GDTR_LIMIT, context->eax);
+    context->eax = 0;
+
+    arbytes.fields.s = 0;                  /* not code or data segment */
+    arbytes.fields.seg_type = 0x2;         /* LDT */
+    arbytes.fields.default_ops_size = 0;   /* 16-bit */
+    arbytes.fields.g = 0;
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
+
+    arbytes.fields.seg_type = 0xb;         /* 32-bit TSS (busy) */
+    error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
+
+    error |= __vmwrite(GUEST_CR0, host_env->cr0); /* same CR0 */
+
+    /* Initially PG, PE are not set */
+    shadow_cr = host_env->cr0;
+    shadow_cr &= ~(X86_CR0_PE | X86_CR0_PG);
+    error |= __vmwrite(CR0_READ_SHADOW, shadow_cr);
+    /* CR3 is set in vmx_final_setup_guestos */
+    error |= __vmwrite(GUEST_CR4, host_env->cr4);
+    shadow_cr = host_env->cr4;
+    shadow_cr &= ~(X86_CR4_PGE | X86_CR4_VMXE);
+    error |= __vmwrite(CR4_READ_SHADOW, shadow_cr);
+
+    error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
+    error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
+
+    error |= __vmwrite(GUEST_ESP,
context->esp); + error |= __vmwrite(GUEST_EIP, context->eip); + + eflags = context->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */ + eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */ + + error |= __vmwrite(GUEST_EFLAGS, eflags); + + error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + __asm__ __volatile__ ("mov %%dr7, %0\n" : "=r" (dr7)); + error |= __vmwrite(GUEST_DR7, dr7); + error |= __vmwrite(GUEST_VMCS0, 0xffffffff); + error |= __vmwrite(GUEST_VMCS1, 0xffffffff); + + return error; +} + +static inline int construct_vmcs_host(struct host_execution_env *host_env) +{ + int error = 0; + unsigned long crn; + struct Xgt_desc_struct desc; + + /* Host Selectors */ + host_env->ds_selector = __HYPERVISOR_DS; + error |= __vmwrite(HOST_ES_SELECTOR, host_env->ds_selector); + error |= __vmwrite(HOST_SS_SELECTOR, host_env->ds_selector); + error |= __vmwrite(HOST_DS_SELECTOR, host_env->ds_selector); + error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector); + error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector); + + host_env->cs_selector = __HYPERVISOR_CS; + error |= __vmwrite(HOST_CS_SELECTOR, host_env->cs_selector); + + host_env->ds_base = 0; + host_env->cs_base = 0; + error |= __vmwrite(HOST_FS_BASE, host_env->ds_base); + error |= __vmwrite(HOST_GS_BASE, host_env->ds_base); + +/* Debug */ + __asm__ __volatile__ ("sidt (%%eax) \n" :: "a"(&desc) : "memory"); + host_env->idtr_limit = desc.size; + host_env->idtr_base = desc.address; + error |= __vmwrite(HOST_IDTR_BASE, host_env->idtr_base); + + __asm__ __volatile__ ("movl %%cr0,%0" : "=r" (crn) : ); + host_env->cr0 = crn; + error |= __vmwrite(HOST_CR0, crn); /* same CR0 */ + + /* CR3 is set in vmx_final_setup_hostos */ + __asm__ __volatile__ ("movl %%cr4,%0" : "=r" (crn) : ); + host_env->cr4 = crn; + error |= __vmwrite(HOST_CR4, crn); + error |= __vmwrite(HOST_EIP, (unsigned long) vmx_asm_vmexit_handler); + + return error; +} + +/* + * Need to extend to support full virtualization. 
+ * The variable use_host_env indicates if the new VMCS needs to use + * the same setups as the host has (xenolinux). + */ + +int construct_vmcs(struct arch_vmx_struct *arch_vmx, + execution_context_t *context, + full_execution_context_t *full_context, + int use_host_env) +{ + int error; + u64 vmcs_phys_ptr; + + struct host_execution_env host_env; + + if (use_host_env != VMCS_USE_HOST_ENV) + return -EINVAL; + + memset(&host_env, 0, sizeof(struct host_execution_env)); + + vmcs_phys_ptr = (u64) virt_to_phys(arch_vmx->vmcs); + + if ((error = __vmpclear (vmcs_phys_ptr))) { + printk("construct_vmcs: VMCLEAR failed\n"); + return -EINVAL; + } + if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) { + printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n", + (unsigned long) vmcs_phys_ptr); + return -EINVAL; + } + if ((error = construct_vmcs_controls())) { + printk("construct_vmcs: construct_vmcs_controls failed\n"); + return -EINVAL; + } + /* host selectors */ + if ((error = construct_vmcs_host(&host_env))) { + printk("construct_vmcs: construct_vmcs_host failed\n"); + return -EINVAL; + } + /* guest selectors */ + if ((error = construct_init_vmcs_guest(context, full_context, &host_env))) { + printk("construct_vmcs: construct_vmcs_guest failed\n"); + return -EINVAL; + } + + if ((error |= __vmwrite(EXCEPTION_BITMAP, + MONITOR_DEFAULT_EXCEPTION_BITMAP))) { + printk("construct_vmcs: setting Exception bitmap failed\n"); + return -EINVAL; + } + + return 0; +} + +int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) +{ + int error; + + if ((error = __vmptrld(phys_ptr))) { + clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); + return error; + } + set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); + return 0; +} + +int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) +{ + /* take the current VMCS */ + __vmptrst(phys_ptr); + clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); + return 0; +} + +void vm_launch_fail(unsigned long eflags) +{ + BUG(); +} + +void 
vm_resume_fail(unsigned long eflags) +{ + BUG(); +} + diff --git a/xen/arch/x86/x86_32/entry.S b/xen/arch/x86/x86_32/entry.S index b63d8203a0..226d5f7ddc 100644 --- a/xen/arch/x86/x86_32/entry.S +++ b/xen/arch/x86/x86_32/entry.S @@ -65,6 +65,104 @@ andl $~3,reg; \ movl (reg),reg; +#ifdef CONFIG_VMX +/* + * At VMExit time the processor saves the guest selectors, esp, eip, + * and eflags. Therefore we don't save them, but simply decrement + * the kernel stack pointer to make it consistent with the stack frame + * at usual interruption time. The eflags of the host is not saved by VMX, + * and we set it to the fixed value. + * + * We also need the room, especially because orig_eax field is used + * by do_IRQ(). Compared to the xen_regs, we skip pushing for the following: + * (1/1) u16 error_code; + * (2/1) u16 entry_vector; + * (2) u32 eip; + * (3) u32 cs; + * (4) u32 eflags; + */ +#define VMX_MONITOR_EFLAGS 0x202 /* IF on */ +#define NR_SKIPPED_REGS 4 /* See the above explanation */ +#define VMX_SAVE_ALL_NOSEGREGS \ + pushl $VMX_MONITOR_EFLAGS; \ + popf; \ + subl $(NR_SKIPPED_REGS*4), %esp; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; + +ENTRY(vmx_asm_vmexit_handler) + /* selectors are restored/saved by VMX */ + VMX_SAVE_ALL_NOSEGREGS + call SYMBOL_NAME(vmx_vmexit_handler) + jmp vmx_asm_do_resume + +ENTRY(vmx_asm_do_launch) + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $(NR_SKIPPED_REGS*4), %esp + /* VMLAUNCH */ + .byte 0x0f,0x01,0xc2 + pushf + call SYMBOL_NAME(vm_launch_fail) + hlt + + ALIGN + +ENTRY(vmx_asm_do_resume) +vmx_test_all_events: + GET_CURRENT(%ebx) +/* test_all_events: */ + xorl %ecx,%ecx + notl %ecx + cli # tests must not race interrupts +/*test_softirqs:*/ + movl EDOMAIN_processor(%ebx),%eax + shl $6,%eax # sizeof(irq_cpustat) == 64 + test %ecx,SYMBOL_NAME(irq_stat)(%eax,1) + jnz vmx_process_softirqs + +vmx_restore_all_guest: + call 
SYMBOL_NAME(load_cr2) + /* + * Check if we are going back to VMX-based VM + * By this time, all the setups in the VMCS must be complete. + */ + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + addl $(NR_SKIPPED_REGS*4), %esp + /* VMRESUME */ + .byte 0x0f,0x01,0xc3 + pushf + call SYMBOL_NAME(vm_resume_fail) + /* Should never reach here */ + hlt + + ALIGN +vmx_process_softirqs: + sti + call SYMBOL_NAME(do_softirq) + jmp vmx_test_all_events +#endif + +ENTRY(continue_nonidle_task) + GET_CURRENT(%ebx) + jmp test_all_events + ALIGN restore_all_guest: testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx) diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c index d7b22caf63..017a27dc17 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -474,7 +474,7 @@ static long evtchn_close(evtchn_close_t *close) } -static long evtchn_send(int lport) +long evtchn_send(int lport) { struct domain *ld = current->domain; struct exec_domain *rd; diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 3d43152d98..90a202014d 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -33,13 +33,13 @@ xmem_cache_t *exec_domain_struct_cachep; struct domain *dom0; vm_assist_info_t vm_assist_info[MAX_VMASST_TYPE + 1]; - +#if 0 struct e820entry { unsigned long addr_lo, addr_hi; /* start of memory segment */ unsigned long size_lo, size_hi; /* size of memory segment */ unsigned long type; /* type of memory segment */ }; - +#endif void start_of_day(void); /* opt_console: comma-separated list of console outputs. 
*/ diff --git a/xen/common/softirq.c b/xen/common/softirq.c index 2a59925a07..ab2f243e15 100644 --- a/xen/common/softirq.c +++ b/xen/common/softirq.c @@ -13,6 +13,7 @@ #include #include #include +#include #include irq_cpustat_t irq_stat[NR_CPUS]; diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 99f47071f1..1ac2eb358a 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -7,6 +7,7 @@ #ifndef __XEN_I386_CONFIG_H__ #define __XEN_I386_CONFIG_H__ +#define CONFIG_VMX 1 #define CONFIG_X86 1 #define CONFIG_SMP 1 diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h index 8b2e913bff..6b02cb878a 100644 --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -71,6 +71,8 @@ #define X86_FEATURE_P4 (3*32+ 7) /* P4 */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */ +#define X86_FEATURE_VMXE (4*32+ 5) /* Virtual Machine Extensions */ #define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ diff --git a/xen/include/asm-x86/e820.h b/xen/include/asm-x86/e820.h new file mode 100644 index 0000000000..080065ef5b --- /dev/null +++ b/xen/include/asm-x86/e820.h @@ -0,0 +1,42 @@ +/* + * structures and definitions for the int 15, ax=e820 memory map + * scheme. + * + * In a nutshell, arch/i386/boot/setup.S populates a scratch table + * in the empty_zero_block that contains a list of usable address/size + * duples. In arch/i386/kernel/setup.c, this information is + * transferred into the e820map, and in arch/i386/mm/init.c, that + * new information is used to mark pages reserved or not. 
+ * + */ +#ifndef __E820_HEADER +#define __E820_HEADER + +#define E820MAP 0x2d0 /* our map */ +#define E820MAX 32 /* number of entries in E820MAP */ +#define E820NR 0x1e8 /* # entries in E820MAP */ + +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */ +#define E820_NVS 4 +#define E820_IO 16 +#define E820_SHARED_PAGE 17 + +#define HIGH_MEMORY (1024*1024) + +#ifndef __ASSEMBLY__ + +struct e820map { + int nr_map; + struct e820entry { + unsigned long long addr; /* start of memory segment */ + unsigned long long size; /* size of memory segment */ + unsigned long type; /* type of memory segment */ + } map[E820MAX]; +}; + +extern struct e820map e820; +#endif/*!__ASSEMBLY__*/ + +#endif/*__E820_HEADER*/ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 87ffe1ecc1..e392d588ca 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -215,10 +215,19 @@ void synchronise_pagetables(unsigned long cpu_mask); * contiguous (or near contiguous) physical memory. */ #undef machine_to_phys_mapping +/* + * The phys_to_machine_mapping is the reversed mapping of MPT for full + * virtualization. 
+ */ +#undef phys_to_machine_mapping + #ifdef __x86_64__ extern unsigned long *machine_to_phys_mapping; #else #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) +#ifdef CONFIG_VMX +#define phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START) +#endif #endif #define DEFAULT_GDT_ENTRIES (LAST_RESERVED_GDT_ENTRY+1) diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h index a412963fd9..793860de54 100644 --- a/xen/include/asm-x86/msr.h +++ b/xen/include/asm-x86/msr.h @@ -84,6 +84,10 @@ #define MSR_MTRRcap 0x0fe #define MSR_IA32_BBL_CR_CTL 0x119 +#define MSR_IA32_SYSENTER_CS 0x174 +#define MSR_IA32_SYSENTER_ESP 0x175 +#define MSR_IA32_SYSENTER_EIP 0x176 + #define MSR_IA32_MCG_CAP 0x179 #define MSR_IA32_MCG_STATUS 0x17a #define MSR_IA32_MCG_CTL 0x17b diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index a23c4a2809..9935c9b2b6 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #endif @@ -84,6 +85,7 @@ #define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ #define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ #define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ +#define X86_CR4_VMXE 0x2000 /* enable VMX */ /* * Trap/fault mnemonics. 
@@ -429,6 +431,9 @@ struct thread_struct { struct desc_struct fast_trap_desc; #endif trap_info_t traps[256]; +#ifdef CONFIG_VMX + struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */ +#endif }; #define IDT_ENTRIES 256 @@ -473,6 +478,18 @@ struct mm_struct { l1_pgentry_t *perdomain_ptes; pagetable_t pagetable; +#ifdef CONFIG_VMX + +#define SHM_full_32 (8) /* full virtualization for 32-bit */ + + pagetable_t monitor_table; + l2_pgentry_t *vpagetable; /* virtual address of pagetable */ + l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */ + l2_pgentry_t *guest_pl2e_cache; /* guest page directory cache */ + unsigned long min_pfn; /* min host physical */ + unsigned long max_pfn; /* max host physical */ +#endif + /* shadow mode status and controls */ unsigned int shadow_mode; /* flags to control shadow table operation */ pagetable_t shadow_table; @@ -502,14 +519,25 @@ struct mm_struct { char gdt[10]; /* NB. 10 bytes needed for x86_64. Use 6 bytes for x86_32. */ }; +#define SHM_full_32 (8) /* full virtualization for 32-bit */ + static inline void write_ptbase(struct mm_struct *mm) { unsigned long pa; +#ifdef CONFIG_VMX + if ( unlikely(mm->shadow_mode) ) { + if (mm->shadow_mode == SHM_full_32) + pa = pagetable_val(mm->monitor_table); + else + pa = pagetable_val(mm->shadow_table); + } +#else if ( unlikely(mm->shadow_mode) ) - pa = pagetable_val(mm->shadow_table); + pa = pagetable_val(mm->shadow_table); +#endif else - pa = pagetable_val(mm->pagetable); + pa = pagetable_val(mm->pagetable); write_cr3(pa); } @@ -533,18 +561,40 @@ long set_gdt(struct exec_domain *d, long set_debugreg(struct exec_domain *p, int reg, unsigned long value); +struct microcode_header { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + struct microcode { - unsigned int hdrver; - unsigned int 
rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int reserved[5]; - unsigned int bits[500]; + struct microcode_header hdr; + unsigned int bits[0]; }; +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; /* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */ #define MICROCODE_IOCFREE _IO('6',0) diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index d94acbe707..29c9dbb5ca 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -17,6 +17,7 @@ #define SHM_logdirty (2) /* log pages that are dirtied */ #define SHM_translate (3) /* lookup machine pages in translation table */ #define SHM_cow (4) /* copy on write all dirtied pages */ +#define SHM_full_32 (8) /* full virtualization for 32-bit */ #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) #define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ @@ -37,6 +38,23 @@ extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte); extern void unshadow_table(unsigned long gpfn, unsigned int type); extern int shadow_mode_enable(struct domain *p, unsigned int mode); +#ifdef CONFIG_VMX +extern void vmx_shadow_clear_state(struct mm_struct *); +extern void vmx_shadow_invlpg(struct mm_struct *, unsigned long); +#endif + +#define __get_machine_to_phys(m, guest_gpfn, gpfn) \ + if ((m)->shadow_mode == SHM_full_32) \ + (guest_gpfn) = machine_to_phys_mapping[(gpfn)]; \ + else \ + (guest_gpfn) = (gpfn); + +#define __get_phys_to_machine(m, host_gpfn, gpfn) \ + if ((m)->shadow_mode == SHM_full_32) \ + 
(host_gpfn) = phys_to_machine_mapping[(gpfn)]; \ + else \ + (host_gpfn) = (gpfn); + extern void __shadow_mode_disable(struct domain *d); static inline void shadow_mode_disable(struct domain *d) { @@ -46,8 +64,14 @@ static inline void shadow_mode_disable(struct domain *d) extern unsigned long shadow_l2_table( struct mm_struct *m, unsigned long gpfn); + +static inline void shadow_invalidate(struct mm_struct *m) { + if (m->shadow_mode != SHM_full_32) + BUG(); + memset(m->shadow_vtable, 0, PAGE_SIZE); +} -#define SHADOW_DEBUG 0 +#define SHADOW_DEBUG 0 #define SHADOW_HASH_DEBUG 0 struct shadow_status { @@ -80,9 +104,55 @@ printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \ printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \ current->id , __LINE__ , ## _a ) #else -#define SH_VVLOG(_f, _a...) +#define SH_VVLOG(_f, _a...) #endif +static inline void __shadow_get_pl2e(struct mm_struct *m, + unsigned long va, unsigned long *sl2e) +{ + if (m->shadow_mode == SHM_full_32) { + *sl2e = l2_pgentry_val(m->shadow_vtable[va >> L2_PAGETABLE_SHIFT]); + } + else + *sl2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]); +} + +static inline void __shadow_set_pl2e(struct mm_struct *m, + unsigned long va, unsigned long value) +{ + if (m->shadow_mode == SHM_full_32) { + m->shadow_vtable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value); + } + else + linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value); +} + +static inline void __guest_get_pl2e(struct mm_struct *m, + unsigned long va, unsigned long *l2e) +{ + if (m->shadow_mode == SHM_full_32) { + *l2e = l2_pgentry_val(m->vpagetable[va >> L2_PAGETABLE_SHIFT]); + } + else + *l2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]); +} + +static inline void __guest_set_pl2e(struct mm_struct *m, + unsigned long va, unsigned long value) +{ + if (m->shadow_mode == SHM_full_32) { + unsigned long pfn; + + pfn = phys_to_machine_mapping[value >> PAGE_SHIFT]; + m->guest_pl2e_cache[va >> L2_PAGETABLE_SHIFT] = + 
mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + m->vpagetable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value); + } + else + linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value); + +} /************************************************************************/ @@ -151,7 +221,6 @@ static inline void l1pte_write_fault( unsigned long spte = *spte_p; ASSERT(gpte & _PAGE_RW); - gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; switch ( m->shadow_mode ) @@ -163,9 +232,19 @@ static inline void l1pte_write_fault( case SHM_logdirty: spte = gpte | _PAGE_RW; __mark_dirty(m, gpte >> PAGE_SHIFT); + + case SHM_full_32: + { + unsigned long host_pfn, host_gpte; + + host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; + host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + spte = host_gpte | _PAGE_RW; + } break; } + SH_VVLOG("updating spte=%lx gpte=%lx", spte, gpte); *gpte_p = gpte; *spte_p = spte; } @@ -187,6 +266,17 @@ static inline void l1pte_read_fault( case SHM_logdirty: spte = gpte & ~_PAGE_RW; break; + + case SHM_full_32: + { + unsigned long host_pfn, host_gpte; + + host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; + host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW); + } + break; + } *gpte_p = gpte; @@ -214,6 +304,20 @@ static inline void l1pte_propagate_from_guest( (_PAGE_PRESENT|_PAGE_ACCESSED) ) spte = gpte & ~_PAGE_RW; break; + + case SHM_full_32: + { + unsigned long host_pfn, host_gpte; + + host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT]; + host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + spte = 0; + + if ( (host_gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == + (_PAGE_PRESENT|_PAGE_ACCESSED) ) + spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW); + } + break; } *gpte_p = gpte; @@ -239,8 +343,12 @@ static inline void l2pde_general( /* Detect linear p.t. mappings and write-protect them. 
*/ if ( (frame_table[sl1pfn].u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - spde = gpde & ~_PAGE_RW; + PGT_l2_page_table ) + { + if (m->shadow_mode != SHM_full_32) + spde = gpde & ~_PAGE_RW; + + } } *gpde_p = gpde; @@ -394,7 +502,7 @@ static inline void delete_shadow_status( head = hash_bucket(m, gpfn); - SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b); + SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head); shadow_audit(m, 0); /* Match on head item? */ @@ -469,7 +577,7 @@ static inline void set_shadow_status( x = head = hash_bucket(m, gpfn); - SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next); + SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, x, x->next); shadow_audit(m, 0); /* @@ -543,7 +651,72 @@ static inline void set_shadow_status( done: shadow_audit(m, 0); } + +#ifdef CONFIG_VMX +#include + +static inline void vmx_update_shadow_state( + struct mm_struct *mm, unsigned long gpfn, unsigned long spfn) +{ + + l2_pgentry_t *mpl2e = 0; + l2_pgentry_t *gpl2e, *spl2e; + + /* unmap the old mappings */ + if (mm->shadow_vtable) + unmap_domain_mem(mm->shadow_vtable); + if (mm->vpagetable) + unmap_domain_mem(mm->vpagetable); + + /* new mapping */ + mpl2e = (l2_pgentry_t *) + map_domain_mem(pagetable_val(mm->monitor_table)); + + mpl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + __flush_tlb_one(SH_LINEAR_PT_VIRT_START); + + spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT); + gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT); + memset(spl2e, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + mm->shadow_table = mk_pagetable(spfn<shadow_vtable = spl2e; + mm->vpagetable = gpl2e; /* expect the guest did clean this up */ + unmap_domain_mem(mpl2e); +} + +static inline void __shadow_mk_pagetable( struct mm_struct *mm ) +{ + unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT; + unsigned long spfn; + SH_VLOG("0: 
__shadow_mk_pagetable(gpfn=%08lx\n", gpfn); + + if (mm->shadow_mode == SHM_full_32) + { + unsigned long guest_gpfn; + guest_gpfn = machine_to_phys_mapping[gpfn]; + + SH_VVLOG("__shadow_mk_pagetable(guest_gpfn=%08lx, gpfn=%08lx\n", + guest_gpfn, gpfn); + + spfn = __shadow_status(mm, gpfn) & PSH_pfn_mask; + if ( unlikely(spfn == 0) ) { + spfn = shadow_l2_table(mm, gpfn); + mm->shadow_table = mk_pagetable(spfn<shadow_table = mk_pagetable(spfn<pagetable) >> PAGE_SHIFT; @@ -554,22 +727,26 @@ static inline void __shadow_mk_pagetable(struct mm_struct *mm) mm->shadow_table = mk_pagetable(spfn << PAGE_SHIFT); } +#endif /* CONFIG_VMX */ static inline void shadow_mk_pagetable(struct mm_struct *mm) { - SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", - pagetable_val(mm->pagetable), mm->shadow_mode ); - - if ( unlikely(mm->shadow_mode) ) - { - shadow_lock(mm); - __shadow_mk_pagetable(mm); - shadow_unlock(mm); - } - - SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx", - pagetable_val(mm->pagetable), mm->shadow_mode, - pagetable_val(mm->shadow_table) ); + if ( unlikely(mm->shadow_mode) ) + { + SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", + pagetable_val(mm->pagetable), mm->shadow_mode ); + + shadow_lock(mm); + __shadow_mk_pagetable(mm); + shadow_unlock(mm); + + SH_VVLOG("leaving shadow_mk_pagetable:\n"); + + SH_VVLOG("( gptbase=%08lx, mode=%d ) sh=%08lx", + pagetable_val(mm->pagetable), mm->shadow_mode, + pagetable_val(mm->shadow_table) ); + + } } #if SHADOW_DEBUG diff --git a/xen/include/asm-x86/vmx.h b/xen/include/asm-x86/vmx.h new file mode 100644 index 0000000000..b59f8d3216 --- /dev/null +++ b/xen/include/asm-x86/vmx.h @@ -0,0 +1,251 @@ +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#ifndef __ASM_X86_VMX_H__ +#define __ASM_X86_VMX_H__ + +#include +#include +#include +#include +#include + +extern void vmx_asm_vmexit_handler(struct xen_regs); +extern void vmx_asm_do_resume(void); +extern void vmx_asm_do_launch(void); +extern void vmx_intr_assist(struct exec_domain *d); + +extern void arch_vmx_do_launch(struct exec_domain *); +extern void arch_vmx_do_resume(struct exec_domain *); + +extern int vmcs_size; +extern unsigned int cpu_rev; + +/* + * Need fill bits for SENTER + */ + +#define MONITOR_PIN_BASED_EXEC_CONTROLS 0x0000001f +#define MONITOR_CPU_BASED_EXEC_CONTROLS 0x0581e7f2 +#define MONITOR_VM_EXIT_CONTROLS 0x0003edff +#define MONITOR_VM_ENTRY_CONTROLS 0x000011ff + +/* + * Exit Reasons + */ +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 + +#define EXIT_REASON_PENDING_INTERRUPT 7 + +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 + +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define 
EXIT_REASON_MWAIT_INSTRUCTION 36 + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define TYPE_MOV_TO_CR (0 << 4) +#define TYPE_MOV_FROM_CR (1 << 4) +#define TYPE_CLTS (2 << 4) +#define CONTROL_REG_ACCESS_REG 0x700 /* 10:8, general purpose register */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG 0x700 /* 11:8, general purpose register */ + +#define EXCEPTION_BITMAP_DE (1 << 0) /* Divide Error */ +#define EXCEPTION_BITMAP_DB (1 << 1) /* Debug */ +#define EXCEPTION_BITMAP_NMI (1 << 2) /* NMI */ +#define EXCEPTION_BITMAP_BP (1 << 3) /* Breakpoint */ +#define EXCEPTION_BITMAP_OF (1 << 4) /* Overflow */ +#define EXCEPTION_BITMAP_BR (1 << 5) /* BOUND Range Exceeded */ +#define EXCEPTION_BITMAP_UD (1 << 6) /* Invalid Opcode */ +#define EXCEPTION_BITMAP_NM (1 << 7) /* Device Not Available */ +#define EXCEPTION_BITMAP_DF (1 << 8) /* Double Fault */ +/* reserved */ +#define EXCEPTION_BITMAP_TS (1 << 10) /* Invalid TSS */ +#define EXCEPTION_BITMAP_NP (1 << 11) /* Segment Not Present */ +#define 
EXCEPTION_BITMAP_SS (1 << 12) /* Stack-Segment Fault */ +#define EXCEPTION_BITMAP_GP (1 << 13) /* General Protection */ +#define EXCEPTION_BITMAP_PG (1 << 14) /* Page Fault */ +#define EXCEPTION_BITMAP_MF (1 << 16) /* x87 FPU Floating-Point Error (Math Fault) */ +#define EXCEPTION_BITMAP_AC (1 << 17) /* Alignment Check */ +#define EXCEPTION_BITMAP_MC (1 << 18) /* Machine Check */ +#define EXCEPTION_BITMAP_XF (1 << 19) /* SIMD Floating-Point Exception */ + +#ifdef XEN_DEBUGGER +#define MONITOR_DEFAULT_EXCEPTION_BITMAP \ + ( EXCEPTION_BITMAP_PG | \ + EXCEPTION_BITMAP_DB | \ + EXCEPTION_BITMAP_BP | \ + EXCEPTION_BITMAP_GP ) +#else +#define MONITOR_DEFAULT_EXCEPTION_BITMAP \ + ( EXCEPTION_BITMAP_PG | \ + EXCEPTION_BITMAP_GP ) +#endif + +#define VMCALL_OPCODE ".byte 0x0f,0x01,0xc1\n" +#define VMCLEAR_OPCODE ".byte 0x66,0x0f,0xc7\n" /* reg/opcode: /6 */ +#define VMLAUNCH_OPCODE ".byte 0x0f,0x01,0xc2\n" +#define VMPTRLD_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /6 */ +#define VMPTRST_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /7 */ +#define VMREAD_OPCODE ".byte 0x0f,0x78\n" +#define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n" +#define VMWRITE_OPCODE ".byte 0x0f,0x79\n" +#define VMXOFF_OPCODE ".byte 0x0f,0x01,0xc4\n" +#define VMXON_OPCODE ".byte 0xf3,0x0f,0xc7\n" + +#define MODRM_EAX_06 ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */ +#define MODRM_EAX_07 ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */ +#define MODRM_EAX_ECX ".byte 0xc1\n" /* [EAX], [ECX] */ + +static inline int __vmptrld (u64 addr) +{ + unsigned long eflags; + __asm__ __volatile__ ( VMPTRLD_OPCODE + MODRM_EAX_06 + : + : "a" (&addr) + : "memory"); + + __save_flags(eflags); + if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) + return -1; + return 0; +} + +static inline void __vmptrst (u64 addr) +{ + __asm__ __volatile__ ( VMPTRST_OPCODE + MODRM_EAX_07 + : + : "a" (&addr) + : "memory"); +} + +static inline int __vmpclear (u64 addr) +{ + unsigned long eflags; + + __asm__ __volatile__ ( VMCLEAR_OPCODE + 
MODRM_EAX_06 + : + : "a" (&addr) + : "memory"); + __save_flags(eflags); + if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) + return -1; + return 0; +} + +static inline int __vmread (unsigned int field, void *value) +{ + unsigned long eflags; + unsigned long ecx = 0; + + __asm__ __volatile__ ( VMREAD_OPCODE + MODRM_EAX_ECX + : "=c" (ecx) + : "a" (field) + : "memory"); + + *((long *) value) = ecx; + + __save_flags(eflags); + if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) + return -1; + return 0; +} + +static inline int __vmwrite (unsigned int field, unsigned int value) +{ + unsigned long eflags; + + __asm__ __volatile__ ( VMWRITE_OPCODE + MODRM_EAX_ECX + : + : "a" (field) , "c" (value) + : "memory"); + __save_flags(eflags); + if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) + return -1; + return 0; +} + +static inline void __vmxoff (void) +{ + __asm__ __volatile__ ( VMXOFF_OPCODE + ::: "memory"); +} + +static inline int __vmxon (u64 addr) +{ + unsigned long eflags; + + __asm__ __volatile__ ( VMXON_OPCODE + MODRM_EAX_06 + : + : "a" (&addr) + : "memory"); + __save_flags(eflags); + if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) + return -1; + return 0; +} +#endif /* __ASM_X86_VMX_H__ */ diff --git a/xen/include/asm-x86/vmx_cpu.h b/xen/include/asm-x86/vmx_cpu.h new file mode 100644 index 0000000000..2cccc151dd --- /dev/null +++ b/xen/include/asm-x86/vmx_cpu.h @@ -0,0 +1,35 @@ +/* + * vmx_cpu.h: Virtual CPU state + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#ifndef __ASM_X86_VMX_VMCS_H__ +#define __ASM_X86_VMX_VMCS_H__ + +/* + * Virtual CPU + */ +struct arch_state_struct { + unsigned long mode_flags; /* vm86, 32-bit, 64-bit, etc. */ + /* debug registers */ + /* MSRs */ +}; + +#define VMX_MF_VM86 0 +#define VMX_MF_32 1 +#define VMX_MF_64 2 + +#endif diff --git a/xen/include/asm-x86/vmx_platform.h b/xen/include/asm-x86/vmx_platform.h new file mode 100644 index 0000000000..f2b8a030c1 --- /dev/null +++ b/xen/include/asm-x86/vmx_platform.h @@ -0,0 +1,24 @@ +/* + * vmx_platform.h: VMX platform support + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#ifndef __ASM_X86_VMX_PLATFORM_H__ +#define __ASM_X86_VMX_PLATFORM_H__ + +#include /* from Linux */ + +#endif diff --git a/xen/include/asm-x86/vmx_vmcs.h b/xen/include/asm-x86/vmx_vmcs.h new file mode 100644 index 0000000000..8ec77d8ed5 --- /dev/null +++ b/xen/include/asm-x86/vmx_vmcs.h @@ -0,0 +1,225 @@ +/* + * vmx_vmcs.h: VMCS related definitions + * Copyright (c) 2004, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#ifndef __ASM_X86_VMX_VMCS_H__ +#define __ASM_X86_VMX_VMCS_H__ + +#include +#include +#include + +extern int start_vmx(void); +extern void stop_vmx(void); + +void vmx_enter_scheduler(void); + +union vmcs_arbytes { + struct arbyte_fields { + unsigned int + seg_type: 4, s: 1, dpl: 2, p: 1, + reserved0: 4, avl: 1, reserved1: 1, + default_ops_size: 1, g: 1, null_bit: 1, + reserved2: 15; + } __attribute__((packed)) fields; + unsigned int bytes; +}; + +struct virutal_platform_def { + unsigned long *real_mode_data; /* E820, etc. 
*/ + unsigned long shared_page_va; +}; + +int vmx_setup_platform(struct exec_domain *, execution_context_t *); + +#define VMX_CPU_STATE_PG_ENABLED 0 + +#define VMCS_SIZE 0x1000 + +struct vmcs_struct { + u32 vmcs_revision_id; + unsigned char data [0x1000 - sizeof (u32)]; +}; + +struct arch_vmx_struct { + struct vmcs_struct *vmcs; /* VMCS pointer in virtual */ + unsigned long flags; /* VMCS flags */ + unsigned long cpu_cr2; /* save CR2 */ + unsigned long cpu_cr3; + unsigned long cpu_state; + struct virutal_platform_def vmx_platform; +#if 0 + /* open */ + unsigned long *page_list; /* page list for MMIO */ +#endif +}; + +#define vmx_schedule_tail(next) \ + (next)->thread.arch_vmx.arch_vmx_schedule_tail((next)) + +#define VMX_DOMAIN(d) d->thread.arch_vmx.flags + +#define ARCH_VMX_VMCS_LOADED 0 /* VMCS has been loaded and active */ +#define ARCH_VMX_VMCS_LAUNCH 1 /* Needs VMCS launch */ +#define ARCH_VMX_VMCS_RESUME 2 /* Needs VMCS resume */ +#define ARCH_VMX_IO_WAIT 3 /* Waiting for I/O completion */ + +void vmx_do_launch(struct exec_domain *); +void vmx_do_resume(struct exec_domain *); + +struct vmcs_struct *alloc_vmcs(void); +void free_vmcs(struct vmcs_struct *); +int load_vmcs(struct arch_vmx_struct *, u64); +int store_vmcs(struct arch_vmx_struct *, u64); +void dump_vmcs(void); +int construct_vmcs(struct arch_vmx_struct *, execution_context_t *, + full_execution_context_t *, int); + +#define VMCS_USE_HOST_ENV 1 +#define VMCS_USE_SEPARATE_ENV 0 + +#define VMCS_EFLAGS_RESERVED_0 0xffc08028 /* bitmap for 0 */ +#define VMCS_EFLAGS_RESERVED_1 0x00000002 /* bitmap for 1 */ + +extern int vmcs_version; + +/* VMCS Encordings */ +enum vmcs_field { + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 
0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_B = 0x00002002, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + TSC_OFFSET = 0x00002010, + GUEST_VMCS0 = 0x00002800, + GUEST_VMCS1 = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + INSTRUCTION_LEN = 0x0000440c, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUES = 
0x00006008, + CR3_GUEST_HOST_MASK = 0x00006208, + EXIT_QUALIFICATION = 0x00006400, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_ESP = 0x0000681c, + GUEST_EIP = 0x0000681e, + GUEST_EFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_ESP = 0x00006c14, + HOST_EIP = 0x00006c16, +}; + +#define VMX_DEBUG 1 +#if VMX_DEBUG +#define DBG_LEVEL_0 (1 << 0) +#define DBG_LEVEL_1 (1 << 1) +#define DBG_LEVEL_2 (1 << 2) +#define DBG_LEVEL_3 (1 << 3) +#define DBG_LEVEL_IO (1 << 4) +#define DBG_LEVEL_VMMU (1 << 5) + +extern unsigned int opt_vmx_debug_level; +#define VMX_DBG_LOG(level, _f, _a...) \ + if ((level) & opt_vmx_debug_level) \ + printk("[VMX]" _f "\n", ## _a ) +#else +#define VMX_DBG_LOG(level, _f, _a...) 
+#endif + +#define __vmx_bug(regs) \ + do { \ + printk("__vmx_bug at %s:%d\n", __FILE__, __LINE__); \ + show_registers(regs); \ + domain_crash(); \ + } while (0) + +#endif /* ASM_X86_VMX_VMCS_H__ */ diff --git a/xen/include/public/arch-x86_32.h b/xen/include/public/arch-x86_32.h index b7210fc1b5..a66bf2b55d 100644 --- a/xen/include/public/arch-x86_32.h +++ b/xen/include/public/arch-x86_32.h @@ -114,6 +114,7 @@ typedef u64 tsc_timestamp_t; /* RDTSC timestamp */ */ typedef struct { #define ECF_I387_VALID (1<<0) +#define ECF_VMX_GUEST (2<<0) unsigned long flags; execution_context_t cpu_ctxt; /* User-level CPU registers */ char fpu_ctxt[256]; /* User-level FPU registers */ diff --git a/xen/include/public/io/ioreq.h b/xen/include/public/io/ioreq.h new file mode 100644 index 0000000000..c10dc46995 --- /dev/null +++ b/xen/include/public/io/ioreq.h @@ -0,0 +1,59 @@ +/* + * ioreq.h: I/O request definitions for device models + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#ifndef _IOREQ_H_ +#define _IOREQ_H_ + +#define IOREQ_READ 1 +#define IOREQ_WRITE 0 + +#define STATE_INVALID 0 +#define STATE_IOREQ_READY 1 +#define STATE_IOREQ_INPROCESS 2 +#define STATE_IORESP_READY 3 + +#define IOPACKET_PORT 2 + +/* VMExit dispatcher should cooperate with instruction decoder to + prepare this structure and notify service OS and DM by sending + virq */ +typedef struct { + u64 addr; /* physical address */ + u64 size; /* size in bytes */ + u64 count; /* for rep prefixes */ + union { + u64 data; /* data */ + void *pdata; /* pointer to data */ + } u; + u8 state:5; + u8 pdata_valid:1; /* if 1, use pdata above */ + u8 dir:1; /* 1=read, 0=write */ + u8 port_mm:1; /* 0=portio, 1=mmio */ +} ioreq_t; + +#define MAX_VECTOR 256 +#define BITS_PER_BYTE 8 +#define INTR_LEN (MAX_VECTOR/(BITS_PER_BYTE * sizeof(unsigned long))) + +typedef struct { + ioreq_t vp_ioreq; + unsigned long vp_intr[INTR_LEN]; +} vcpu_iodata_t; + +#endif /* _IOREQ_H_ */ diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index ccda04ebe2..a138c64dda 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -26,8 +26,6 @@ extern unsigned long volatile jiffies; extern rwlock_t domlist_lock; -struct domain; - /* A global pointer to the initial domain (DOM0). */ extern struct domain *dom0; diff --git a/xen/include/xen/types.h b/xen/include/xen/types.h index 0299f74136..cd55353dfe 100644 --- a/xen/include/xen/types.h +++ b/xen/include/xen/types.h @@ -44,5 +44,7 @@ typedef __u32 uint32_t; typedef __u64 uint64_t; +struct domain; +struct exec_domain; #endif /* __TYPES_H__ */ -- 2.30.2